import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
from matplotlib.ticker import AutoMinorLocator
from matplotlib import gridspec
#scaling, normalization
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
#kmeans, dbscan, hierarchical (sklearn)
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
#evaluation
from sklearn.metrics import silhouette_score
from sklearn import metrics
#distance matrix (dbscan elbow, hierarchical)
from scipy.spatial.distance import pdist, squareform
# hierarchical (scipy)
from scipy.cluster.hierarchy import linkage, dendrogram
#seaborn #bokeh #altair
from sklearn.decomposition import PCA
from yellowbrick.cluster import KElbowVisualizer
from google.colab import files
!pip install pyfim
from fim import apriori
Collecting pyfim
Downloading pyfim-6.28.tar.gz (357 kB)
|████████████████████████████████| 357 kB 13.9 MB/s
Building wheels for collected packages: pyfim
Building wheel for pyfim (setup.py) ... done
Created wheel for pyfim: filename=pyfim-6.28-cp37-cp37m-linux_x86_64.whl size=537786 sha256=7f0c078240f9aada9e12e1e542a02d2fa17a14ed48ebaebe13981e6baf197a47
Stored in directory: /root/.cache/pip/wheels/08/9f/26/09cb4efd027e46f96e0a0f33d0a74be614d3caf89c1eeb75a8
Successfully built pyfim
Installing collected packages: pyfim
Successfully installed pyfim-6.28
# load the dataset
df = pd.read_csv('words_glasgow.csv')
# keep a pristine copy in case the data gets manipulated later
dfcopy= df.copy()
# show a few rows to get a feel for the data
# first rows
df.head()
| word | length | arousal | valence | dominance | concreteness | imageability | familiarity | aoa | semsize | gender | polysemy | web_corpus_freq | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | abattoir | 8 | 4.200 | 2.864 | 4.333 | 5.455 | 4.391 | 2.382 | 6.760 | 4.652 | 5.391 | 0 | 160074.0 |
| 1 | abbey | 5 | 3.125 | 5.781 | 4.667 | 5.906 | 5.344 | 3.324 | 5.177 | 5.121 | 3.303 | 0 | 4224864.0 |
| 2 | abbreviate | 10 | 3.273 | 5.250 | 5.235 | 3.286 | 3.177 | 5.121 | 5.543 | 2.667 | 3.971 | 0 | 140105.0 |
| 3 | abdicate | 8 | 4.194 | 3.767 | 4.419 | 3.367 | 2.516 | 3.971 | 6.233 | 4.679 | 5.167 | 0 | 124123.0 |
| 4 | abdication | 10 | 3.846 | 3.880 | 4.800 | 3.292 | 2.571 | 3.097 | 6.407 | 5.083 | 4.571 | 0 | 128143.0 |
# last rows of the dataset
df.tail()
| word | length | arousal | valence | dominance | concreteness | imageability | familiarity | aoa | semsize | gender | polysemy | web_corpus_freq | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 4677 | zeppelin | 8 | 6.185 | 5.000 | 5.333 | 6.286 | 6.185 | 3.167 | 6.036 | 5.464 | 5.269 | 0 | 3192943.0 |
| 4678 | zero | 4 | 4.031 | 4.182 | 4.567 | 2.688 | 3.903 | 6.269 | 2.636 | 1.758 | 4.172 | 0 | 30735412.0 |
| 4679 | zest | 4 | 5.969 | 6.818 | 6.121 | 4.438 | 4.033 | 4.000 | 5.364 | 2.636 | 3.452 | 0 | 655010.0 |
| 4680 | zoo | 3 | 5.909 | 6.235 | 5.485 | 6.118 | 6.441 | 5.655 | 2.324 | 4.844 | 4.059 | 0 | 11589578.0 |
| 4681 | zoology | 7 | 4.571 | 5.765 | 5.114 | 4.429 | 3.914 | 4.382 | 5.824 | 4.571 | 3.657 | 0 | 1672374.0 |
# dataset dimensions (rows, columns)
df.shape
(4682, 13)
# first indicative overview of the numeric columns
df.describe()
| length | arousal | valence | dominance | concreteness | imageability | familiarity | aoa | semsize | gender | polysemy | web_corpus_freq | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 4682.000000 | 4682.000000 | 4682.000000 | 4682.000000 | 4682.000000 | 4682.000000 | 4682.000000 | 4682.000000 | 4682.000000 | 4682.000000 | 4682.000000 | 4.668000e+03 |
| mean | 6.348355 | 4.678129 | 5.086797 | 5.044939 | 4.566273 | 4.723018 | 5.271335 | 4.143427 | 4.136403 | 4.099933 | 0.080948 | 2.988976e+07 |
| std | 2.006230 | 1.097163 | 1.594344 | 0.930669 | 1.433689 | 1.363110 | 0.921218 | 1.252770 | 1.023293 | 0.912293 | 0.272785 | 8.490144e+07 |
| min | 2.000000 | 2.057000 | 1.030000 | 1.941000 | 1.636000 | 1.737000 | 1.647000 | 1.219000 | 1.375000 | 1.000000 | 0.000000 | 1.277000e+04 |
| 25% | 5.000000 | 3.849000 | 4.115000 | 4.529000 | 3.242000 | 3.519250 | 4.706000 | 3.114000 | 3.438000 | 3.606000 | 0.000000 | 1.671100e+06 |
| 50% | 6.000000 | 4.571000 | 5.290000 | 5.123000 | 4.471000 | 4.677000 | 5.438000 | 4.177000 | 4.186500 | 4.121000 | 0.000000 | 5.702982e+06 |
| 75% | 8.000000 | 5.419000 | 6.088000 | 5.600000 | 5.971000 | 6.032000 | 5.969000 | 5.152000 | 4.882000 | 4.656000 | 0.000000 | 2.232705e+07 |
| max | 16.000000 | 8.177000 | 8.647000 | 8.371000 | 6.938000 | 6.941000 | 6.939000 | 6.971000 | 6.912000 | 6.971000 | 1.000000 | 2.022460e+09 |
# summary of the object (string) columns — shows "word" is unique per row
df.describe(include=['O'])
| word | |
|---|---|
| count | 4682 |
| unique | 4682 |
| top | toward |
| freq | 1 |
# column dtypes and non-null counts (web_corpus_freq has 14 missing values)
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 4682 entries, 0 to 4681 Data columns (total 13 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 word 4682 non-null object 1 length 4682 non-null int64 2 arousal 4682 non-null float64 3 valence 4682 non-null float64 4 dominance 4682 non-null float64 5 concreteness 4682 non-null float64 6 imageability 4682 non-null float64 7 familiarity 4682 non-null float64 8 aoa 4682 non-null float64 9 semsize 4682 non-null float64 10 gender 4682 non-null float64 11 polysemy 4682 non-null int64 12 web_corpus_freq 4668 non-null float64 dtypes: float64(10), int64(2), object(1) memory usage: 475.6+ KB
In this dataset there are 4682 unique words. Each word is examined with regard to different variables. Those variables are length, arousal, valence, dominance, concreteness, imageability, familiarity, age of acquisition, semantic size, gender, polysemy and frequency in a Google web corpus.
## 1.1 Data Semantics
# shared colour palette for all plots (light -> dark green), reused by later cells
colors1 = ['#e5f5f9','#99d8c9','#2ca25f']
# columns from "familiarity" onwards
first=df.loc[:,"familiarity":]
aoa_descr=first.describe()
# export the summary statistics as a LaTeX table
print(aoa_descr.to_latex())
\begin{tabular}{lrrrrrr}
\toprule
{} & familiarity & aoa & semsize & gender & polysemy & web\_corpus\_freq \\
\midrule
count & 4682.000000 & 4682.000000 & 4682.000000 & 4682.000000 & 4682.000000 & 4.668000e+03 \\
mean & 5.271335 & 4.143427 & 4.136403 & 4.099933 & 0.080948 & 2.988976e+07 \\
std & 0.921218 & 1.252770 & 1.023293 & 0.912293 & 0.272785 & 8.490144e+07 \\
min & 1.647000 & 1.219000 & 1.375000 & 1.000000 & 0.000000 & 1.277000e+04 \\
25\% & 4.706000 & 3.114000 & 3.438000 & 3.606000 & 0.000000 & 1.671100e+06 \\
50\% & 5.438000 & 4.177000 & 4.186500 & 4.121000 & 0.000000 & 5.702982e+06 \\
75\% & 5.969000 & 5.152000 & 4.882000 & 4.656000 & 0.000000 & 2.232705e+07 \\
max & 6.939000 & 6.971000 & 6.912000 & 6.971000 & 1.000000 & 2.022460e+09 \\
\bottomrule
\end{tabular}
Brief description of the variables
# VARIABLE OF INTEREST
refvar="arousal"
# absolute frequency counts for arousal
df[refvar].value_counts()
# BOXPLOT
plt.rcParams["figure.figsize"] = [16, 8]
sb.set_context("notebook", font_scale=1.5, rc={"font.size":14,"axes.titlesize": 16,"axes.labelsize": 12})
aoa_boxplot = sb.boxplot( x = refvar, data=df, color = colors1[1])
# cosmetics: build the title from the variable name
boxplot_str="boxplot"
str1="distribution of"+" "+refvar
str2=boxplot_str+" "+str1
plt.title(str2, size = 18)
aoa_boxplot.set_xlabel(refvar, rotation = "horizontal", size = 16)
# DISTRIBUTION
plt.rcParams["figure.figsize"] = [16, 12]
sb.set_context("notebook", font_scale=1.5, rc={"font.size":14,"axes.titlesize": 16,"axes.labelsize": 12})
# BUG FIX: dropna(0, inplace=False) passed `axis` positionally; Series.dropna
# arguments are keyword-only since pandas 2.0, so the old call raises TypeError.
aoa1 = df[refvar].dropna()
data = aoa1
fig, ax = plt.subplots()
counts, bins, patches = ax.hist(data,color = colors1[1], bins=range(1,10),edgecolor='black')
# Set the ticks to be at the edges of the bins.
ax.set_xticks(bins)
# cosmetics
plt.title(str1, size = 18)
plt.xlabel(xlabel = refvar, fontsize = 20, labelpad=20)
plt.ylabel(ylabel = "Occurences", fontsize = 16)
sb.despine(right = True)
plt.show()
# summary statistics, exported as a LaTeX table
aoa_descr=df[refvar].describe()
print(aoa_descr.to_latex())
\begin{tabular}{lr}
\toprule
{} & arousal \\
\midrule
count & 4682.000000 \\
mean & 4.678129 \\
std & 1.097163 \\
min & 2.057000 \\
25\% & 3.849000 \\
50\% & 4.571000 \\
75\% & 5.419000 \\
max & 8.177000 \\
\bottomrule
\end{tabular}
Other comments
Brief description of the variables
# VARIABLE OF INTEREST
refvar="valence"
# absolute frequency counts for valence
df[refvar].value_counts()
# BOXPLOT
plt.rcParams["figure.figsize"] = [16, 8]
sb.set_context("notebook", font_scale=1.5, rc={"font.size":14,"axes.titlesize": 16,"axes.labelsize": 12})
aoa_boxplot = sb.boxplot( x = refvar, data=df, color = colors1[1])
# cosmetics: build the title from the variable name
boxplot_str="boxplot"
str1="distribution of"+" "+refvar
str2=boxplot_str+" "+str1
plt.title(str2, size = 18)
aoa_boxplot.set_xlabel(refvar, rotation = "horizontal", size = 16)
# DISTRIBUTION
plt.rcParams["figure.figsize"] = [16, 12]
sb.set_context("notebook", font_scale=1.5, rc={"font.size":14,"axes.titlesize": 16,"axes.labelsize": 12})
# BUG FIX: dropna(0, inplace=False) passed `axis` positionally; Series.dropna
# arguments are keyword-only since pandas 2.0, so the old call raises TypeError.
aoa1 = df[refvar].dropna()
data = aoa1
fig, ax = plt.subplots()
counts, bins, patches = ax.hist(data,color = colors1[1], bins=range(1,10),edgecolor='black')
# Set the ticks to be at the edges of the bins.
ax.set_xticks(bins)
# cosmetics
plt.title(str1, size = 18)
plt.xlabel(xlabel = refvar, fontsize = 20, labelpad=20)
plt.ylabel(ylabel = "Occurences", fontsize = 16)
sb.despine(right = True)
plt.show()
# summary statistics, exported as a LaTeX table
aoa_descr=df[refvar].describe()
print(aoa_descr.to_latex())
\begin{tabular}{lr}
\toprule
{} & valence \\
\midrule
count & 4682.000000 \\
mean & 5.086797 \\
std & 1.594344 \\
min & 1.030000 \\
25\% & 4.115000 \\
50\% & 5.290000 \\
75\% & 6.088000 \\
max & 8.647000 \\
\bottomrule
\end{tabular}
Other comments
Brief description of the variables
# VARIABLE OF INTEREST
refvar="dominance"
# absolute frequency counts for dominance
df[refvar].value_counts()
# BOXPLOT
plt.rcParams["figure.figsize"] = [16, 8]
sb.set_context("notebook", font_scale=1.5, rc={"font.size":14,"axes.titlesize": 16,"axes.labelsize": 12})
aoa_boxplot = sb.boxplot( x = refvar, data=df, color = colors1[1])
# cosmetics: build the title from the variable name
boxplot_str="boxplot"
str1="distribution of"+" "+refvar
str2=boxplot_str+" "+str1
plt.title(str2, size = 18)
aoa_boxplot.set_xlabel(refvar, rotation = "horizontal", size = 16)
# DISTRIBUTION
plt.rcParams["figure.figsize"] = [16, 12]
sb.set_context("notebook", font_scale=1.5, rc={"font.size":14,"axes.titlesize": 16,"axes.labelsize": 12})
# BUG FIX: dropna(0, inplace=False) passed `axis` positionally; Series.dropna
# arguments are keyword-only since pandas 2.0, so the old call raises TypeError.
aoa1 = df[refvar].dropna()
data = aoa1
fig, ax = plt.subplots()
counts, bins, patches = ax.hist(data,color = colors1[1], bins=range(1,10),edgecolor='black')
# Set the ticks to be at the edges of the bins.
ax.set_xticks(bins)
# cosmetics
plt.title(str1, size = 18)
plt.xlabel(xlabel = refvar, fontsize = 20, labelpad=20)
plt.ylabel(ylabel = "Occurences", fontsize = 16)
sb.despine(right = True)
plt.show()
# summary statistics, exported as a LaTeX table
aoa_descr=df[refvar].describe()
print(aoa_descr.to_latex())
\begin{tabular}{lr}
\toprule
{} & dominance \\
\midrule
count & 4682.000000 \\
mean & 5.044939 \\
std & 0.930669 \\
min & 1.941000 \\
25\% & 4.529000 \\
50\% & 5.123000 \\
75\% & 5.600000 \\
max & 8.371000 \\
\bottomrule
\end{tabular}
Other comments
(numerical): represents the degree to which something can be experienced by our senses. It ranges from 1, abstract words, to 7, concrete words.
# VARIABLE OF INTEREST
refvar="concreteness"
# absolute frequency counts for concreteness
df[refvar].value_counts()
# BOXPLOT
plt.rcParams["figure.figsize"] = [16, 8]
sb.set_context("notebook", font_scale=1.5, rc={"font.size":14,"axes.titlesize": 16,"axes.labelsize": 12})
aoa_boxplot = sb.boxplot( x = refvar, data=df, color = colors1[1])
# cosmetics: build the title from the variable name
boxplot_str="boxplot"
str1="distribution of"+" "+refvar
str2=boxplot_str+" "+str1
plt.title(str2, size = 18)
aoa_boxplot.set_xlabel(refvar, rotation = "horizontal", size = 16)
# DISTRIBUTION
plt.rcParams["figure.figsize"] = [16, 12]
sb.set_context("notebook", font_scale=1.5, rc={"font.size":14,"axes.titlesize": 16,"axes.labelsize": 12})
# BUG FIX: dropna(0, inplace=False) passed `axis` positionally; Series.dropna
# arguments are keyword-only since pandas 2.0, so the old call raises TypeError.
aoa1 = df[refvar].dropna()
data = aoa1
fig, ax = plt.subplots()
# the 1-7 rating scale needs only bins 1..7
counts, bins, patches = ax.hist(data,color = colors1[1], bins=range(1,8),edgecolor='black')
# Set the ticks to be at the edges of the bins.
ax.set_xticks(bins)
# cosmetics
plt.title(str1, size = 18)
plt.xlabel(xlabel = refvar, fontsize = 20, labelpad=20)
plt.ylabel(ylabel = "Occurences", fontsize = 16)
sb.despine(right = True)
plt.show()
# summary statistics, exported as a LaTeX table
aoa_descr=df[refvar].describe()
print(aoa_descr.to_latex())
\begin{tabular}{lr}
\toprule
{} & concreteness \\
\midrule
count & 4682.000000 \\
mean & 4.566273 \\
std & 1.433689 \\
min & 1.636000 \\
25\% & 3.242000 \\
50\% & 4.471000 \\
75\% & 5.971000 \\
max & 6.938000 \\
\bottomrule
\end{tabular}
Other comments
(numerical): measures how difficult is to generate a mental image of something. It ranges from 1, hard to imagine, to 7, easy to imagine.
# VARIABLE OF INTEREST
refvar="imageability"
# absolute frequency counts for imageability
df[refvar].value_counts()
# BOXPLOT
plt.rcParams["figure.figsize"] = [16, 8]
sb.set_context("notebook", font_scale=1.5, rc={"font.size":14,"axes.titlesize": 16,"axes.labelsize": 12})
aoa_boxplot = sb.boxplot( x = refvar, data=df, color = colors1[1])
# cosmetics: build the title from the variable name
boxplot_str="boxplot"
str1="distribution of"+" "+refvar
str2=boxplot_str+" "+str1
plt.title(str2, size = 18)
aoa_boxplot.set_xlabel(refvar, rotation = "horizontal", size = 16)
# DISTRIBUTION
plt.rcParams["figure.figsize"] = [16, 12]
sb.set_context("notebook", font_scale=1.5, rc={"font.size":14,"axes.titlesize": 16,"axes.labelsize": 12})
# BUG FIX: dropna(0, inplace=False) passed `axis` positionally; Series.dropna
# arguments are keyword-only since pandas 2.0, so the old call raises TypeError.
aoa1 = df[refvar].dropna()
data = aoa1
fig, ax = plt.subplots()
# the 1-7 rating scale needs only bins 1..7
counts, bins, patches = ax.hist(data,color = colors1[1], bins=range(1,8),edgecolor='black')
# Set the ticks to be at the edges of the bins.
ax.set_xticks(bins)
# cosmetics
plt.title(str1, size = 18)
plt.xlabel(xlabel = refvar, fontsize = 20, labelpad=20)
plt.ylabel(ylabel = "Occurences", fontsize = 16)
sb.despine(right = True)
plt.show()
# summary statistics, exported as a LaTeX table
aoa_descr=df[refvar].describe()
print(aoa_descr.to_latex())
\begin{tabular}{lr}
\toprule
{} & imageability \\
\midrule
count & 4682.000000 \\
mean & 4.723018 \\
std & 1.363110 \\
min & 1.737000 \\
25\% & 3.519250 \\
50\% & 4.677000 \\
75\% & 6.032000 \\
max & 6.941000 \\
\bottomrule
\end{tabular}
Other comments
(numerical): is a measure of a word’s subjective experience. The range is from 1 (unfamiliar) to 7 (familiar).
# VARIABLE OF INTEREST
refvar="familiarity"
# absolute frequency counts for familiarity
df[refvar].value_counts()
# BOXPLOT
plt.rcParams["figure.figsize"] = [16, 8]
sb.set_context("notebook", font_scale=1.5, rc={"font.size":14,"axes.titlesize": 16,"axes.labelsize": 12})
aoa_boxplot = sb.boxplot( x = refvar, data=df, color = colors1[1])
# cosmetics: build the title from the variable name
boxplot_str="boxplot"
str1="distribution of"+" "+refvar
str2=boxplot_str+" "+str1
plt.title(str2, size = 18)
aoa_boxplot.set_xlabel(refvar, rotation = "horizontal", size = 16)
# DISTRIBUTION
plt.rcParams["figure.figsize"] = [16, 12]
sb.set_context("notebook", font_scale=1.5, rc={"font.size":14,"axes.titlesize": 16,"axes.labelsize": 12})
# BUG FIX: dropna(0, inplace=False) passed `axis` positionally; Series.dropna
# arguments are keyword-only since pandas 2.0, so the old call raises TypeError.
aoa1 = df[refvar].dropna()
data = aoa1
fig, ax = plt.subplots()
# the 1-7 rating scale needs only bins 1..7
counts, bins, patches = ax.hist(data,color = colors1[1], bins=range(1,8),edgecolor='black')
# Set the ticks to be at the edges of the bins.
ax.set_xticks(bins)
# cosmetics
plt.title(str1, size = 18)
plt.xlabel(xlabel = refvar, fontsize = 20, labelpad=20)
plt.ylabel(ylabel = "Occurences", fontsize = 16)
sb.despine(right = True)
plt.show()
# summary statistics, exported as a LaTeX table
aoa_descr=df[refvar].describe()
print(aoa_descr.to_latex())
\begin{tabular}{lr}
\toprule
{} & familiarity \\
\midrule
count & 4682.000000 \\
mean & 5.271335 \\
std & 0.921218 \\
min & 1.647000 \\
25\% & 4.706000 \\
50\% & 5.438000 \\
75\% & 5.969000 \\
max & 6.939000 \\
\bottomrule
\end{tabular}
Other comments
The variable age of acquisition (aoa) indicates the supposed age at which a person first learned that specific word. Clearly it is not easy to remember at which age a word was learned, and for that reason in this dataset this variable refers to an estimation of the age of acquisition. The scale is defined as a series of consecutive 2-year periods from the ages of 2 to 12 years, and a final period referring to 13 years and older. This gives 7 different ranges: 0-2, 2-4, 4-6, 6-8, 8-10, 10-12 and 13+.
# absolute frequency counts for aoa (age of acquisition)
df["aoa"].value_counts()
5.000 45
4.000 29
3.029 24
4.657 23
5.029 22
..
5.278 1
6.036 1
6.633 1
5.839 1
1.871 1
Name: aoa, Length: 953, dtype: int64
# set up the figure: boxplot and histogram side by side
colors1 = ['#e5f5f9','#99d8c9','#2ca25f']
plt.rcParams["figure.figsize"] = [16, 4]
sb.set_context("notebook", font_scale=1.5, rc={"font.size":14,"axes.titlesize": 16,"axes.labelsize": 12})
plt.subplot(1, 2, 1)
# distribution and central tendency: boxplot
aoa_boxplot = sb.boxplot( x = "aoa", data=df, color = colors1[1])
plt.title("Boxplot distribution of age of aquisition", size = 18)
aoa_boxplot.set_xlabel("Age of aquisition", rotation = "horizontal", size = 16)
plt.subplot(1, 2, 2)
# BUG FIX: dropna(0, inplace=False) passed `axis` positionally; Series.dropna
# arguments are keyword-only since pandas 2.0, so the old call raises TypeError.
aoa1 = df["aoa"].dropna()
# distribution and central tendency: histogram over the 0-7 aoa scale
aoa_hist = plt.hist(aoa1, color = colors1[1],edgecolor='black',bins=range(0,8))
plt.title("Distribution of aoa", size = 18)
plt.xlabel(xlabel = "Age of Aquisition", fontsize = 16)
plt.ylabel(ylabel = "Occurences", fontsize = 16)
sb.despine(right = True)
plt.show()
# Standalone boxplot of the age-of-acquisition variable.
colors1 = ['#e5f5f9', '#99d8c9', '#2ca25f']
plt.rcParams["figure.figsize"] = [16, 8]
sb.set_context("notebook", font_scale=1.5,
               rc={"font.size": 14, "axes.titlesize": 16, "axes.labelsize": 12})
aoa_boxplot = sb.boxplot(x="aoa", data=df, color=colors1[1])
aoa_boxplot.set_xlabel("Age of aquisition", rotation="horizontal", size=16)
plt.title("Boxplot distribution of age of aquisition", size=18)
plt.show()
colors1 = ['#e5f5f9','#99d8c9','#2ca25f']
# labels for the seven age-of-acquisition periods (hoisted: the original
# rebuilt this list on every loop iteration)
age_range=['0-2','2-4','4-6','6-8','8-10','10-12','13+']
# BOXPLOT
plt.rcParams["figure.figsize"] = [16, 8]
sb.set_context("notebook", font_scale=1.5, rc={"font.size":14,"axes.titlesize": 16,"axes.labelsize": 12})
aoa_boxplot = sb.boxplot( x = "aoa", data=df, color = colors1[1])
plt.title("Boxplot distribution of Age of Aquisition", size = 18)
# BUG FIX: the original annotated here using `bins` and `ax` left over from a
# previous cell (stale notebook state); the age-range labels belong to the
# histogram below, so that stale annotation loop was removed.
aoa_boxplot.set_xlabel("Age of Aquisition", rotation = "horizontal", size = 16)
# DISTRIBUTION
plt.rcParams["figure.figsize"] = [16, 8]
sb.set_context("notebook", font_scale=1.5, rc={"font.size":14,"axes.titlesize": 16,"axes.labelsize": 12})
# BUG FIX: dropna(0, inplace=False) passed `axis` positionally; Series.dropna
# arguments are keyword-only since pandas 2.0, so the old call raises TypeError.
aoa1 = df["aoa"].dropna()
data = aoa1
fig, ax = plt.subplots()
# 8 edges (0..7) give the 7 bins matched by age_range
counts, bins, patches = ax.hist(data,color = colors1[1], bins=range(0,8),edgecolor='black')
# Set the ticks to be at the edges of the bins.
ax.set_xticks(bins)
# Label each bin with its age range below the x-axis
bin_centers = 0.5 * np.diff(bins) + bins[:-1]
for i, x in zip(range(0,7), bin_centers):
    ax.annotate(age_range[i], xy=(x, 0), xycoords=('data', 'axes fraction'),
    xytext=(0, -22), textcoords='offset points',fontsize=16, va='top', ha='center')
# cosmetics
plt.title("Distribution of aoa", size = 18)
plt.xlabel(xlabel = "Age of Aquisition", fontsize = 20, labelpad=20)
plt.ylabel(ylabel = "Occurences", fontsize = 16)
sb.despine(right = True)
plt.show()
# summary statistics, exported as a LaTeX table
aoa_descr=df["aoa"].describe()
print(aoa_descr.to_latex())
\begin{tabular}{lr}
\toprule
{} & aoa \\
\midrule
count & 4682.000000 \\
mean & 4.143427 \\
std & 1.252770 \\
min & 1.219000 \\
25\% & 3.114000 \\
50\% & 4.177000 \\
75\% & 5.152000 \\
max & 6.971000 \\
\bottomrule
\end{tabular}
colors1 = ['#e5f5f9','#99d8c9','#2ca25f']
refvar="aoa"
# labels for the seven age-of-acquisition periods (hoisted out of the loop)
age_range=['0-2','2-4','4-6','6-8','8-10','10-12','13+']
# BOXPLOT
plt.rcParams["figure.figsize"] = [16, 8]
sb.set_context("notebook", font_scale=1.5, rc={"font.size":14,"axes.titlesize": 16,"axes.labelsize": 12})
aoa_boxplot = sb.boxplot( x = refvar, data=df, color = colors1[1])
# cosmetics
plt.title("Boxplot distribution of aoa", size = 18)
aoa_boxplot.set_xlabel("Age of Aquisition", rotation = "horizontal", size = 16)
# DISTRIBUTION
plt.rcParams["figure.figsize"] = [16, 8]
sb.set_context("notebook", font_scale=1.5, rc={"font.size":14,"axes.titlesize": 16,"axes.labelsize": 12})
# BUG FIX: dropna(0, inplace=False) passed `axis` positionally; Series.dropna
# arguments are keyword-only since pandas 2.0, so the old call raises TypeError.
aoa1 = df[refvar].dropna()
data = aoa1
fig, ax = plt.subplots()
# BUG FIX: range(1,8) produced only 6 bins for the 7 age-range labels,
# shifting every label by one bin; range(0,8) matches the earlier aoa cell.
counts, bins, patches = ax.hist(data,color = colors1[1], bins=range(0,8),edgecolor='black')
# Set the ticks to be at the edges of the bins.
ax.set_xticks(bins)
# Label each bin with its age range below the x-axis
bin_centers = 0.5 * np.diff(bins) + bins[:-1]
for i, x in zip(range(0,7), bin_centers):
    ax.annotate(age_range[i], xy=(x, 0), xycoords=('data', 'axes fraction'),
    xytext=(0, -22), textcoords='offset points',fontsize=16, va='top', ha='center')
# Give ourselves some more room at the bottom of the plot
plt.subplots_adjust(bottom=0.50)
# cosmetics
plt.title("Distribution of aoa", size = 18)
plt.xlabel(xlabel = "Age of Aquisition", fontsize = 20, labelpad=20)
plt.ylabel(ylabel = "Occurences", fontsize = 16)
sb.despine(right = True)
plt.show()
# summary statistics, exported as a LaTeX table
aoa_descr=df[refvar].describe()
print(aoa_descr.to_latex())
\begin{tabular}{lr}
\toprule
{} & aoa \\
\midrule
count & 4682.000000 \\
mean & 4.143427 \\
std & 1.252770 \\
min & 1.219000 \\
25\% & 3.114000 \\
50\% & 4.177000 \\
75\% & 5.152000 \\
max & 6.971000 \\
\bottomrule
\end{tabular}
# summary statistics for aoa as a LaTeX table (duplicate of the cell above)
aoa_descr=df["aoa"].describe()
print(aoa_descr.to_latex())
\begin{tabular}{lr}
\toprule
{} & aoa \\
\midrule
count & 4682.000000 \\
mean & 4.143427 \\
std & 1.252770 \\
min & 1.219000 \\
25\% & 3.114000 \\
50\% & 4.177000 \\
75\% & 5.152000 \\
max & 6.971000 \\
\bottomrule
\end{tabular}
There are 4682 occurrences for this variable. That means that there are no missing values, or NaN. The mean is 4.14, and the standard deviation is 1.25.
In this dataset "size" is a measure of magnitude expressed in either concrete or abstract terms (big, small). That is, whether a word can be associated with adjectives like big or small (e.g. palace or mountain for concrete objects, and knowledge or love for abstract ideas).
# absolute frequency counts for semsize
df["semsize"].value_counts()
4.000 52
5.000 42
3.000 35
4.971 27
4.677 26
..
5.241 1
5.891 1
4.160 1
6.281 1
5.867 1
Name: semsize, Length: 939, dtype: int64
# set up the figure: boxplot and histogram side by side
colors1 = ['#e5f5f9','#99d8c9','#2ca25f']
plt.rcParams["figure.figsize"] = [16, 4]
sb.set_context("notebook", font_scale=1.5, rc={"font.size":14,"axes.titlesize": 16,"axes.labelsize": 12})
plt.subplot(1, 2, 1)
# distribution and central tendency: boxplot
aoa_boxplot = sb.boxplot( x = "semsize", data=df, color = colors1[1])
plt.title("Boxplot distribution of semantic size", size = 18)
aoa_boxplot.set_xlabel("semantic size", rotation = "horizontal", size = 16)
plt.subplot(1, 2, 2)
# BUG FIX: dropna(0, inplace=False) passed `axis` positionally; Series.dropna
# arguments are keyword-only since pandas 2.0, so the old call raises TypeError.
size1 = df["semsize"].dropna()
# distribution and central tendency: histogram over the 1-7 scale
aoa_hist = plt.hist(size1, color = colors1[1],edgecolor='black',bins=range(1,8))
# cosmetics
plt.title("Distribution of semantic size", size = 18)
plt.xlabel(xlabel = "semantic size", fontsize = 16)
plt.ylabel(ylabel = "Degree", fontsize = 16)
sb.despine(right = True)
plt.show()
colors1 = ['#e5f5f9','#99d8c9','#2ca25f']
refvar="semsize"
# BOXPLOT
plt.rcParams["figure.figsize"] = [16, 8]
sb.set_context("notebook", font_scale=1.5, rc={"font.size":14,"axes.titlesize": 16,"axes.labelsize": 12})
aoa_boxplot = sb.boxplot( x = refvar, data=df, color = colors1[1])
# cosmetics
plt.title("Boxplot distribution of semantic size", size = 18)
aoa_boxplot.set_xlabel("Semantic Size", rotation = "horizontal", size = 16)
# DISTRIBUTION
plt.rcParams["figure.figsize"] = [16, 12]
sb.set_context("notebook", font_scale=1.5, rc={"font.size":14,"axes.titlesize": 16,"axes.labelsize": 12})
# BUG FIX: dropna(0, inplace=False) passed `axis` positionally; Series.dropna
# arguments are keyword-only since pandas 2.0, so the old call raises TypeError.
aoa1 = df[refvar].dropna()
data = aoa1
fig, ax = plt.subplots()
counts, bins, patches = ax.hist(data,color = colors1[1], bins=range(1,8),edgecolor='black')
# Set the ticks to be at the edges of the bins.
ax.set_xticks(bins)
# cosmetics
plt.title("Distribution of semsize", size = 18)
plt.xlabel(xlabel = "Semantic Size", fontsize = 20, labelpad=20)
plt.ylabel(ylabel = "Occurences", fontsize = 16)
sb.despine(right = True)
plt.show()
# summary statistics, exported as a LaTeX table
aoa_descr=df[refvar].describe()
print(aoa_descr.to_latex())
\begin{tabular}{lr}
\toprule
{} & semsize \\
\midrule
count & 4682.000000 \\
mean & 4.136403 \\
std & 1.023293 \\
min & 1.375000 \\
25\% & 3.438000 \\
50\% & 4.186500 \\
75\% & 4.882000 \\
max & 6.912000 \\
\bottomrule
\end{tabular}
# summary statistics for semsize
df["semsize"].describe()
count 4682.000000 mean 4.136403 std 1.023293 min 1.375000 25% 3.438000 50% 4.186500 75% 4.882000 max 6.912000 Name: semsize, dtype: float64
The count of occurrences for this variable suggests that there are no missing values. The mean is 4.13 and the standard deviation is just above 1. These results suggest a tendency to perceive words as having a larger semantic size.
Gender in this dataset refers to how strongly its meaning is associated with male or female behaviour or idea. This variable could be very interesting in regards of the social bias that might, or might not, be present.
# absolute frequency counts for gender
df["gender"].value_counts()
4.000 86
5.000 39
3.971 32
4.500 29
4.029 28
..
3.129 1
4.861 1
1.743 1
4.080 1
6.588 1
Name: gender, Length: 1022, dtype: int64
# set up the figure: boxplot and histogram side by side
colors1 = ['#e5f5f9','#99d8c9','#2ca25f']
plt.rcParams["figure.figsize"] = [16, 4]
sb.set_context("notebook", font_scale=1.5, rc={"font.size":14,"axes.titlesize": 16,"axes.labelsize": 12})
plt.subplot(1, 2, 1)
# distribution and central tendency: boxplot
gender_boxplot = sb.boxplot( x = "gender", data=df, color = colors1[1])
plt.title("Boxplot distribution of perceived gender", size = 18)
gender_boxplot.set_xlabel("gender", rotation = "horizontal", size = 16)
plt.subplot(1, 2, 2)
# BUG FIX: dropna(0, inplace=False) passed `axis` positionally; Series.dropna
# arguments are keyword-only since pandas 2.0, so the old call raises TypeError.
gender1 = df["gender"].dropna()
# distribution and central tendency: histogram
aoa_hist = plt.hist(gender1, color = colors1[1])
# legend
# BUG FIX: the original wrote the labels as bare arithmetic ([0-2, 2-4, ...]
# evaluates to [-2, -2, ...]) and called plt.legend() before any artist
# existed, so no usable legend was drawn.  The legend now follows the hist.
# NOTE(review): these range labels look copy-pasted from the aoa cell --
# confirm they are meaningful for the gender plot.
lables_size = ['0-2', '2-4', '4-6', '6-8', '8-10', '10-12', '13+']
plt.legend(title="Perceived gender", title_fontsize = 14, bbox_to_anchor=(1, 1), labels=lables_size )
plt.title("Distribution of perceived gender", size = 18)
plt.xlabel(xlabel = "gender", fontsize = 16)
plt.ylabel(ylabel = "Degree", fontsize = 16)
sb.despine(right = True)
plt.show()
# summary statistics for gender
df["gender"].describe()
count 4682.000000 mean 4.099933 std 0.912293 min 1.000000 25% 3.606000 50% 4.121000 75% 4.656000 max 6.971000 Name: gender, dtype: float64
# summary statistics for word length
df["length"].describe()
count 4682.000000 mean 6.348355 std 2.006230 min 2.000000 25% 5.000000 50% 6.000000 75% 8.000000 max 16.000000 Name: length, dtype: float64
import matplotlib.ticker as mticker
colors1 = ['#e5f5f9','#99d8c9','#2ca25f']
refvar="gender"
# BOXPLOT
plt.rcParams["figure.figsize"] = [16, 8]
sb.set_context("notebook", font_scale=1.5, rc={"font.size":14,"axes.titlesize": 16,"axes.labelsize": 12})
aoa_boxplot = sb.boxplot( x = refvar, data=df, color = colors1[1])
# cosmetics
plt.title("Boxplot distribution of gender", size = 18)
aoa_boxplot.set_xlabel("Gender", rotation = "horizontal", size = 16)
# DISTRIBUTION
plt.rcParams["figure.figsize"] = [16, 12]
sb.set_context("notebook", font_scale=1.5, rc={"font.size":14,"axes.titlesize": 16,"axes.labelsize": 12})
# BUG FIX: dropna(0, inplace=False) passed `axis` positionally; Series.dropna
# arguments are keyword-only since pandas 2.0, so the old call raises TypeError.
aoa1 = df[refvar].dropna()
data = aoa1
fig, ax = plt.subplots()
counts, bins, patches = ax.hist(data,color = colors1[1], bins=range(1,8),edgecolor='black')
# Set the ticks to be at the edges of the bins.
ax.set_xticks(bins)
# cosmetics
plt.title("Distribution of gender", size = 18)
plt.xlabel(xlabel = "Gender", fontsize = 20, labelpad=20)
plt.ylabel(ylabel = "Occurences", fontsize = 16)
# extra minor-tick labels under the two extremes of the 1-7 scale
# NOTE(review): "Label A"/"Label B" are placeholders -- presumably the
# feminine/masculine poles of the scale; confirm before publishing.
ax.xaxis.set_minor_locator(mticker.FixedLocator((1, 7)))
ax.xaxis.set_minor_formatter(mticker.FixedFormatter(("Label A", "Label B")))
# BUG FIX: the minor tick labels were installed on the x-axis, but the
# original styled ax.yaxis minor labels (which do not exist)
plt.setp(ax.xaxis.get_minorticklabels(), size=15, va="center")
# BUG FIX: `left=False` is a y-axis switch; `bottom=False` actually hides
# the x-axis minor tick marks
ax.tick_params("x",which="minor",pad=25, bottom=False)
sb.despine(right = True)
plt.show()
# summary statistics, exported as a LaTeX table
aoa_descr=df[refvar].describe()
print(aoa_descr.to_latex())
\begin{tabular}{lr}
\toprule
{} & gender \\
\midrule
count & 4682.000000 \\
mean & 4.099933 \\
std & 0.912293 \\
min & 1.000000 \\
25\% & 3.606000 \\
50\% & 4.121000 \\
75\% & 4.656000 \\
max & 6.971000 \\
\bottomrule
\end{tabular}
This variable is not self-intuitive. There is no visible correlation between the numeric value and the perceived gender of that word. A supposition is that the higher the value, the more "masculine" the word is perceived. For example the word "actor" has a perceived gender value of 5.588, where the max value is 6.971.
Also it seems that the distribution is heavily centered, leaving a significant amount of outliers, as seen in the figure above.
# example of the gender scale: rows 51/52 ("actor" vs "actress")
df[["word","gender"]].loc[[51, 52]]
| word | gender | |
|---|---|---|
| 51 | actor | 5.588 |
| 52 | actress | 1.303 |
Codice per le distribuzioni delle variabili 'arousal', 'valence', 'dominance', 'concreteness','imageability', 'familiarity'
# correlation matrix (displayed by the notebook)
# BUG FIX: since pandas 2.0 DataFrame.corr() raises on non-numeric columns
# (here the "word" column); numeric_only=True restores the old behaviour.
df.corr(numeric_only=True)
# distributions of the six rating variables, rescaled to [0, 1]
scaler = MinMaxScaler()
var = ['arousal', 'valence', 'dominance', 'concreteness',
       'imageability', 'familiarity']
for e in var:
    print(e)
    Y = df[[e]].values
    Y_minmax = scaler.fit_transform(Y)
    plt.hist(Y_minmax, edgecolor='white')
    plt.show()
# pie chart: share of polysemic vs monosemic words
# IDIOM: vectorised counts replace the original manual loop over .values
p = int((df['polysemy'] == 1).sum())
m = int((df['polysemy'] == 0).sum())
# percentages of polysemic (p1) and monosemic (m1) words
p1 = p/(p+m)*100
m1 = m/(p+m)*100
print(p1, m1)
# plot
labels = 'Polisemic', 'Monosemic'
sizes = [p1, m1]
explode = (0.1, 0)  # only "explode" the polysemic slice
fig1, ax1 = plt.subplots()
ax1.pie(sizes, explode=explode, labels=labels, autopct='%1.1f%%',
        shadow=True, startangle=90)
ax1.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.
plt.show()
# correlation heatmap over the numeric columns
# BUG FIX: since pandas 2.0 DataFrame.corr() raises on non-numeric columns
# (here the "word" column); numeric_only=True restores the old behaviour.
corr=df.corr(numeric_only=True)
plt.figure(figsize=(16, 6))
heatmap = sb.heatmap(corr, vmin=-1, vmax=1, annot=True, cmap='BrBG')
heatmap.set_title('Correlation Heatmap', fontdict={'fontsize':18}, pad=12);
#sb.heatmap(corr, cmap="Blues", annot=True)
arousal
valence
dominance
concreteness
imageability
familiarity
8.094831268688594 91.9051687313114
df.isnull().sum()
word 0 length 0 arousal 0 valence 0 dominance 0 concreteness 0 imageability 0 familiarity 0 aoa 0 semsize 0 gender 0 polysemy 0 web_corpus_freq 14 dtype: int64
The dataset seems to be almost without null values. In fact, there are only 14 NaN, and all are concentrated in the "web_corpus_freq" variable.
df[df['web_corpus_freq'].isnull()]
| word | length | arousal | valence | dominance | concreteness | imageability | familiarity | aoa | semsize | gender | polysemy | web_corpus_freq | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 585 | burgle | 6 | 5.118 | 2.303 | 3.656 | 4.970 | 5.424 | 5.200 | 3.735 | 4.697 | 5.333 | 0 | NaN |
| 753 | Christmas | 9 | 7.516 | 7.914 | 5.600 | 5.086 | 6.571 | 6.710 | 1.600 | 6.394 | 3.771 | 0 | NaN |
| 1070 | Dad | 3 | 4.912 | 6.849 | 4.618 | 6.257 | 6.400 | 6.853 | 1.265 | 5.147 | 6.706 | 0 | NaN |
| 1076 | Dame | 4 | 4.194 | 5.594 | 5.469 | 5.125 | 4.969 | 3.697 | 4.969 | 4.548 | 1.242 | 0 | NaN |
| 1540 | 8 | 4.971 | 4.857 | 4.486 | 5.943 | 6.229 | 6.829 | 6.314 | 5.114 | 4.171 | 0 | NaN | |
| 1559 | FALSE | 5 | 4.636 | 2.941 | 4.206 | 3.455 | 2.765 | 5.700 | 3.086 | 4.500 | 4.353 | 0 | NaN |
| 2673 | Mom | 3 | 5.667 | 7.936 | 4.813 | 6.424 | 6.250 | 6.594 | 1.333 | 5.094 | 1.097 | 0 | NaN |
| 2724 | Mum | 3 | 4.594 | 7.938 | 4.219 | 6.091 | 6.625 | 6.906 | 1.219 | 5.061 | 1.212 | 0 | NaN |
| 2726 | Mummy | 5 | 5.364 | 7.471 | 4.879 | 5.794 | 6.515 | 6.182 | 1.771 | 4.677 | 1.455 | 0 | NaN |
| 3773 | skijump | 7 | 5.914 | 5.771 | 5.486 | 6.200 | 6.529 | 4.758 | 5.028 | 5.389 | 5.000 | 0 | NaN |
| 4347 | TRUE | 4 | 5.743 | 7.914 | 6.219 | 2.529 | 2.719 | 6.156 | 2.400 | 5.424 | 3.182 | 0 | NaN |
| 4365 | TV | 2 | 4.824 | 5.706 | 4.559 | 6.677 | 6.857 | 6.706 | 2.206 | 3.333 | 4.629 | 0 | NaN |
| 4373 | 7 | 4.235 | 4.943 | 4.824 | 4.886 | 5.600 | 6.273 | 6.971 | 4.771 | 3.829 | 0 | NaN | |
| 4668 | yo-yo | 5 | 5.059 | 5.800 | 5.636 | 6.455 | 6.424 | 4.484 | 2.800 | 1.875 | 4.206 | 0 | NaN |
Looking at the values that are present in the dataset but absent from the web corpus, we can spot some similarities. There are 3 occurrences of nearly the same word: "Mom", "Mum" and "Mummy". The word "Dad" is also missing from the corpus.
In addition, there are 3 words written in full capital letters: "FALSE", "TRUE", and "TV". If the corpus is case sensitive, that could explain why those words are not present.
Finally, there are 2 social-media words that are absent from the corpus: "Facebook" and "Twitter". This is a bit strange, since both words are very well known.
#creating a copy of the df
df2 = df.copy()
#dropping non-rating variables so the boxplot grid compares like-scaled columns
df3 = df2.drop(["word","web_corpus_freq", "polysemy","length"], axis=1)
df3.head()
| arousal | valence | dominance | concreteness | imageability | familiarity | aoa | semsize | gender | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 4.200 | 2.864 | 4.333 | 5.455 | 4.391 | 2.382 | 6.760 | 4.652 | 5.391 |
| 1 | 3.125 | 5.781 | 4.667 | 5.906 | 5.344 | 3.324 | 5.177 | 5.121 | 3.303 |
| 2 | 3.273 | 5.250 | 5.235 | 3.286 | 3.177 | 5.121 | 5.543 | 2.667 | 3.971 |
| 3 | 4.194 | 3.767 | 4.419 | 3.367 | 2.516 | 3.971 | 6.233 | 4.679 | 5.167 |
| 4 | 3.846 | 3.880 | 4.800 | 3.292 | 2.571 | 3.097 | 6.407 | 5.083 | 4.571 |
# One boxplot per rating variable, side by side in a single row.
# Outliers are drawn as blue circles, means as green diamonds.
blue_circle = dict(markerfacecolor='blue', marker='o', markeredgecolor='white')
mean_shape = dict(markerfacecolor='green', marker='D', markeredgecolor='green')
# one column of subplots per column of df3
fig, axs = plt.subplots(1, len(df3.columns), figsize=(20,10))
for col_idx, panel in enumerate(axs.flat):
    # df3.iloc[:, col_idx] selects every row of the col_idx-th column
    panel.boxplot(df3.iloc[:,col_idx], flierprops=blue_circle, showmeans = True, meanprops =mean_shape, notch=True)
    panel.set_title(df3.columns[col_idx], fontsize=20, fontweight='bold')
    panel.tick_params(axis='y', labelsize=14)
plt.tight_layout()
First, some variables had to be dropped to perform this analysis with boxplots. "word" is not a numerical variable, and for that reason is not used in this plot. "web_corpus_freq" and "polysemy" can be analysed on their own.
The plot shows that some variables are well distributed, like "concreteness" and "imageability", while others are not, like "length", "arousal" and "dominance". Later this issue will be addressed with more depth.
# Horizontal boxplot of the binary polysemy flag.
df["polysemy"].plot(kind="box", vert = False)
<matplotlib.axes._subplots.AxesSubplot at 0x7f8740c62590>
"polysemy" has only 2 possible values, 0 and 1, that are false and true. For that reason it doesn't have outliers.
# Pie chart of the polysemous/monosemous split.
df["polysemy"].value_counts().plot(kind="pie")
<matplotlib.axes._subplots.AxesSubplot at 0x7f87417ba110>
"web_corpus_freq" instead seems to have a lot of outliers, as shown below.
df["web_corpus_freq"].dropna(0, inplace = False).plot(kind="box", vert = False, logx=True)
<matplotlib.axes._subplots.AxesSubplot at 0x7f8736941e90>
# Sanity check: every value in the rating columns should be a numpy float64.
# Any offending type is printed; "tutto ok fra" is printed once per column
# regardless, as in the original.
var = ['arousal', 'valence', 'dominance', 'concreteness',
       'imageability', 'familiarity','semsize','gender','aoa']
for column in var:
    for value in df[column].values:
        if type(value) != np.float64:
            print(type(value))
    print("tutto ok fra")
tutto ok fra tutto ok fra tutto ok fra tutto ok fra tutto ok fra tutto ok fra tutto ok fra tutto ok fra tutto ok fra
#creating a copy of the df
df_errors = df.copy()
# Show word/length pairs before the consistency check below.
df_errors[["word", "length"]]
| word | length | |
|---|---|---|
| 0 | abattoir | 8 |
| 1 | abbey | 5 |
| 2 | abbreviate | 10 |
| 3 | abdicate | 8 |
| 4 | abdication | 10 |
| ... | ... | ... |
| 4677 | zeppelin | 8 |
| 4678 | zero | 4 |
| 4679 | zest | 4 |
| 4680 | zoo | 3 |
| 4681 | zoology | 7 |
4682 rows × 2 columns
# Replace each word with its character count so it can be compared
# against the stored 'length' column.
df_errors["word"] = df_errors.word.str.len()
df_errors[["word", "length"]]
| word | length | |
|---|---|---|
| 0 | 8 | 8 |
| 1 | 5 | 5 |
| 2 | 10 | 10 |
| 3 | 8 | 8 |
| 4 | 10 | 10 |
| ... | ... | ... |
| 4677 | 8 | 8 |
| 4678 | 4 | 4 |
| 4679 | 4 | 4 |
| 4680 | 3 | 3 |
| 4681 | 7 | 7 |
4682 rows × 2 columns
df_errors["word"].equals(df_errors["length"])
True
All values of the variable 'word' are correct with regard to their length.
df.loc[df['polysemy']==1]
| word | length | arousal | valence | dominance | concreteness | imageability | familiarity | aoa | semsize | gender | polysemy | web_corpus_freq | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 62 | address | 7 | 3.206 | 5.382 | 5.364 | 5.059 | 4.343 | 5.839 | 3.371 | 3.912 | 4.000 | 1 | 261872866.0 |
| 107 | aim | 3 | 4.794 | 6.606 | 6.829 | 2.677 | 2.941 | 5.909 | 3.857 | 4.206 | 4.265 | 1 | 28951240.0 |
| 188 | apple | 5 | 4.677 | 6.147 | 5.485 | 6.824 | 6.909 | 6.719 | 1.529 | 2.441 | 3.424 | 1 | 50551171.0 |
| 211 | arm | 3 | 3.735 | 5.471 | 5.774 | 6.727 | 6.571 | 6.546 | 1.457 | 3.171 | 4.265 | 1 | 23724057.0 |
| 216 | arms | 4 | 4.364 | 5.546 | 5.636 | 6.647 | 6.485 | 6.469 | 2.206 | 3.758 | 4.912 | 1 | 27432921.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 4630 | wood | 4 | 3.000 | 5.206 | 5.265 | 6.727 | 6.606 | 6.303 | 2.629 | 3.727 | 5.235 | 1 | 51130555.0 |
| 4647 | wound | 5 | 5.156 | 2.706 | 3.667 | 5.758 | 5.853 | 5.500 | 4.118 | 3.706 | 5.118 | 1 | 5820050.0 |
| 4661 | yak | 3 | 3.552 | 4.807 | 4.700 | 5.219 | 4.742 | 2.829 | 4.897 | 3.833 | 4.710 | 1 | 1056994.0 |
| 4662 | yard | 4 | 2.788 | 5.171 | 4.771 | 5.829 | 5.343 | 4.177 | 3.771 | 4.286 | 4.457 | 1 | 15075593.0 |
| 4663 | yarn | 4 | 2.750 | 5.273 | 5.219 | 5.818 | 5.333 | 3.813 | 4.303 | 2.394 | 2.813 | 1 | 5113505.0 |
379 rows × 13 columns
An overview of the relation between the nine variables is provided in Fig. 6. Where a correlation greater than |0.6| is found, we plotted the values of the two variables for a better visualization (Fig. 7). There is a strong correlation (0.91) between concreteness and imageability: it is difficult to imagine an abstract word and easier to imagine a concrete one. Moreover, concreteness and imageability relate to the other variables similarly, with a margin of ±0.14. Therefore we merged them into a new variable, perceivability. The values of perceivability are the mean of the concreteness and imageability values. Other positively correlated variables are valence and dominance, with 0.72: the more valuable an item is perceived to be, the higher the degree of control over the object. Familiarity and age of acquisition are instead negatively related: from the pairplot (Fig \ref{fig:pairplot}) it is apparent that every word acquired at an early age is highly familiar.
#correlation matrix — same heatmap as earlier, repeated for the report figure
corr=df.corr()
plt.figure(figsize=(16, 6))
heatmap = sb.heatmap(corr, vmin=-1, vmax=1, annot=True, cmap='BrBG')
heatmap.set_title('Correlation Heatmap', fontdict={'fontsize':18}, pad=12);
#sb.heatmap(corr, cmap="Blues", annot=True)
#creating a copy of the df
dfpp = df.copy()
#dropping some variables for pairplot visualization
dfpp1 = dfpp.drop(["word", "polysemy","length", 'gender', 'web_corpus_freq', 'semsize', 'arousal'], axis=1)
# Pairplot of the remaining rating variables.
sb.pairplot(dfpp1,
            plot_kws=dict(marker=".", linewidth=1),
            diag_kws=dict(fill=False),
            corner=False
            )
<seaborn.axisgrid.PairGrid at 0x7f8736531950>
# Zoom on the strongly correlated pair: concreteness vs imageability (r = 0.91).
sb.pairplot(dfpp1,
            x_vars=['concreteness'],
            y_vars=['imageability'],
            plot_kws=dict(marker=".", linewidth=1),
            diag_kws=dict(fill=False),
            corner=False
            )
#plt.savefig('pp_imageability_concreteness.png', dpi=300)
#files.download('pp_imageability_concreteness.png')
<seaborn.axisgrid.PairGrid at 0x7f87358cfe50>
# Valence vs dominance (r = 0.72).
pp_dv = sb.pairplot(dfpp1,
                    x_vars=['valence'],
                    y_vars=['dominance'],
                    plot_kws=dict(marker=".", linewidth=1),
                    diag_kws=dict(fill=False),
                    corner=False
                    )
#plt.savefig('pp_dominance_valence.png', dpi=300)
#files.download('pp_dominance_valence.png')
# Familiarity vs age of acquisition (negatively related).
sb.pairplot(dfpp1,
            x_vars=['familiarity'],
            y_vars=['aoa'],
            plot_kws=dict(marker=".", linewidth=1),
            diag_kws=dict(fill=False),
            corner=False
            )
#plt.savefig('pp_aoa_familiarity.png', dpi=300)
#files.download('pp_aoa_familiarity.png')
<seaborn.axisgrid.PairGrid at 0x7f87358ff910>
# perceivability = row-wise mean of imageability and concreteness (the two
# variables correlated at 0.91, merged as discussed above).
df2["perceivability"] = df2[["imageability", "concreteness"]].mean(axis=1)
df2.head()
| word | length | arousal | valence | dominance | concreteness | imageability | familiarity | aoa | semsize | gender | polysemy | web_corpus_freq | perceivability | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | abattoir | 8 | 4.200 | 2.864 | 4.333 | 5.455 | 4.391 | 2.382 | 6.760 | 4.652 | 5.391 | 0 | 160074.0 | 4.9230 |
| 1 | abbey | 5 | 3.125 | 5.781 | 4.667 | 5.906 | 5.344 | 3.324 | 5.177 | 5.121 | 3.303 | 0 | 4224864.0 | 5.6250 |
| 2 | abbreviate | 10 | 3.273 | 5.250 | 5.235 | 3.286 | 3.177 | 5.121 | 5.543 | 2.667 | 3.971 | 0 | 140105.0 | 3.2315 |
| 3 | abdicate | 8 | 4.194 | 3.767 | 4.419 | 3.367 | 2.516 | 3.971 | 6.233 | 4.679 | 5.167 | 0 | 124123.0 | 2.9415 |
| 4 | abdication | 10 | 3.846 | 3.880 | 4.800 | 3.292 | 2.571 | 3.097 | 6.407 | 5.083 | 4.571 | 0 | 128143.0 | 2.9315 |
# Drop the two merged variables now that perceivability replaces them.
df_perc=df2.drop(["concreteness","imageability"], axis=1)
df_perc.head()
| word | length | arousal | valence | dominance | familiarity | aoa | semsize | gender | polysemy | web_corpus_freq | perceivability | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | abattoir | 8 | 4.200 | 2.864 | 4.333 | 2.382 | 6.760 | 4.652 | 5.391 | 0 | 160074.0 | 4.9230 |
| 1 | abbey | 5 | 3.125 | 5.781 | 4.667 | 3.324 | 5.177 | 5.121 | 3.303 | 0 | 4224864.0 | 5.6250 |
| 2 | abbreviate | 10 | 3.273 | 5.250 | 5.235 | 5.121 | 5.543 | 2.667 | 3.971 | 0 | 140105.0 | 3.2315 |
| 3 | abdicate | 8 | 4.194 | 3.767 | 4.419 | 3.971 | 6.233 | 4.679 | 5.167 | 0 | 124123.0 | 2.9415 |
| 4 | abdication | 10 | 3.846 | 3.880 | 4.800 | 3.097 | 6.407 | 5.083 | 4.571 | 0 | 128143.0 | 2.9315 |
# Preprocessing copy: rename gender -> masculinity and impute the 14 missing
# web_corpus_freq values with the column mean.
dfprepro= df_perc.copy()
dfprepro=dfprepro.rename(columns={"gender": "masculinity"})
# fillna is the idiomatic, equivalent form of the original
# .loc[isnull() == True] mean-assignment.
dfprepro['web_corpus_freq'] = dfprepro['web_corpus_freq'].fillna(dfprepro['web_corpus_freq'].mean())
# Verify: no NaN rows remain (the empty table below confirms it).
dfprepro[dfprepro['web_corpus_freq'].isnull()]
| word | length | arousal | valence | dominance | familiarity | aoa | semsize | masculinity | polysemy | web_corpus_freq | perceivability |
|---|
# NOTE(review): despite its name, web_corpus_log holds equal-frequency decile
# bins from pd.qcut, not a log transform — confirm the intended name.
dfprepro["web_corpus_log"] = pd.qcut(dfprepro["web_corpus_freq"], 10) #cut web_corpus_freq into 10 equal-frequency groups
print(dfprepro[["web_corpus_log", "web_corpus_freq"]].groupby(["web_corpus_log"], as_index=False).mean())
web_corpus_log web_corpus_freq 0 (12769.999, 575889.4] 3.097333e+05 1 (575889.4, 1242854.6] 9.041106e+05 2 (1242854.6, 2214291.0] 1.686212e+06 3 (2214291.0, 3597973.4] 2.849525e+06 4 (3597973.4, 5702981.5] 4.637392e+06 5 (5702981.5, 9168887.0] 7.342067e+06 6 (9168887.0, 16042883.5] 1.222427e+07 7 (16042883.5, 31729949.6] 2.273650e+07 8 (31729949.6, 69227170.1] 4.752152e+07 9 (69227170.1, 2022459848.0] 1.985905e+08
# Discretize web_corpus_freq into order-of-magnitude codes 4..9 (10^4..10^9).
# The original wrapped dfprepro in a one-element list and looped over it;
# mutating dfprepro directly is equivalent since every update is in place
# and applied in the same order.
magnitude_bands = [
    (10000, 100000, 4),
    (100000, 1000000, 5),
    (1000000, 10000000, 6),
    (10000000, 100000000, 7),
    (100000000, 1000000000, 8),
]
for lower, upper, code in magnitude_bands:
    in_band = (dfprepro["web_corpus_freq"] > lower) & (dfprepro["web_corpus_freq"] <= upper)
    dfprepro.loc[in_band, "web_corpus_freq"] = code
dfprepro.loc[dfprepro["web_corpus_freq"] > 1000000000, "web_corpus_freq"] = 9
dfprepro.head()
| word | length | arousal | valence | dominance | familiarity | aoa | semsize | masculinity | polysemy | web_corpus_freq | perceivability | web_corpus_log | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | abattoir | 8 | 4.200 | 2.864 | 4.333 | 2.382 | 6.760 | 4.652 | 5.391 | 0 | 5.0 | 4.9230 | (12769.999, 575889.4] |
| 1 | abbey | 5 | 3.125 | 5.781 | 4.667 | 3.324 | 5.177 | 5.121 | 3.303 | 0 | 6.0 | 5.6250 | (3597973.4, 5702981.5] |
| 2 | abbreviate | 10 | 3.273 | 5.250 | 5.235 | 5.121 | 5.543 | 2.667 | 3.971 | 0 | 5.0 | 3.2315 | (12769.999, 575889.4] |
| 3 | abdicate | 8 | 4.194 | 3.767 | 4.419 | 3.971 | 6.233 | 4.679 | 5.167 | 0 | 5.0 | 2.9415 | (12769.999, 575889.4] |
| 4 | abdication | 10 | 3.846 | 3.880 | 4.800 | 3.097 | 6.407 | 5.083 | 4.571 | 0 | 5.0 | 2.9315 | (12769.999, 575889.4] |
# Histogram of the discretized frequency codes (4..9).
dfprepro["web_corpus_freq"].plot(kind="hist")
<matplotlib.axes._subplots.AxesSubplot at 0x7f8735866510>
dfprepro['web_corpus_freq'].describe()
count 4682.000000 mean 6.285135 std 0.843987 min 4.000000 25% 6.000000 50% 6.000000 75% 7.000000 max 9.000000 Name: web_corpus_freq, dtype: float64
dfprepro['web_corpus_log'].describe()
count 4682 unique 10 top (68973807.5, 2022459848.0] freq 469 Name: web_corpus_log, dtype: object
# Drop the helper decile column and the non-numeric word column before
# scaling and PCA.
dfprepro = dfprepro.drop(["web_corpus_log","word"], axis=1)
#dfprepro.loc[(dfprepro['web_corpus_freq'].isnull() == True), 'web_corpus_freq'] = dfprepro['web_corpus_freq'].mean()
# Confirm there are no remaining NaNs.
dfprepro.isnull().sum()
dfprepro.head()
| length | arousal | valence | dominance | familiarity | aoa | semsize | masculinity | polysemy | web_corpus_freq | perceivability | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 8 | 4.200 | 2.864 | 4.333 | 2.382 | 6.760 | 4.652 | 5.391 | 0 | 5.0 | 4.9230 |
| 1 | 5 | 3.125 | 5.781 | 4.667 | 3.324 | 5.177 | 5.121 | 3.303 | 0 | 6.0 | 5.6250 |
| 2 | 10 | 3.273 | 5.250 | 5.235 | 5.121 | 5.543 | 2.667 | 3.971 | 0 | 5.0 | 3.2315 |
| 3 | 8 | 4.194 | 3.767 | 4.419 | 3.971 | 6.233 | 4.679 | 5.167 | 0 | 5.0 | 2.9415 |
| 4 | 10 | 3.846 | 3.880 | 4.800 | 3.097 | 6.407 | 5.083 | 4.571 | 0 | 5.0 | 2.9315 |
# Project the preprocessed data onto the first two principal components,
# then min-max scale the scores to [0, 1].
pca = PCA(n_components=2)
pca.fit(dfprepro)
# Compute the projection once and reuse it: the original called
# pca.transform three times, once with the result discarded.
score_pca = pca.transform(dfprepro)
PCA_df = pd.DataFrame(score_pca)
scaler = MinMaxScaler()
Y_pca=PCA_df.values
Y_pca_minmax = scaler.fit_transform(Y_pca)
scaled_dfprepro = pd.DataFrame(data = Y_pca_minmax, columns = PCA_df.columns)
scaled_dfprepro.head()
# Min-max scale the full (non-PCA) feature matrix as well, keeping the
# original column names for the per-cluster boxplots later.
X=dfprepro.values
X_minmax = scaler.fit_transform(X)
df_xminmax = pd.DataFrame(data = X_minmax, columns = dfprepro.columns)
scaled_dfprepro.head()
| 0 | 1 | |
|---|---|---|
| 0 | 0.567559 | 0.141222 |
| 1 | 0.313091 | 0.396255 |
| 2 | 0.603087 | 0.434351 |
| 3 | 0.573763 | 0.250204 |
| 4 | 0.696536 | 0.292227 |
# Re-attach the word column and move it to the front for readability.
df_xminmax['word']=df_perc['word']
# 'word' was appended last, so this rebuilds the same front-loaded order
# as the original rotate-by-one of the column list.
cols = ['word'] + [c for c in df_xminmax.columns.tolist() if c != 'word']
df_xminmax=df_xminmax[cols]
df_xminmax.head()
| word | length | arousal | valence | dominance | familiarity | aoa | semsize | masculinity | polysemy | web_corpus_freq | perceivability | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | abattoir | 0.428571 | 0.350163 | 0.240777 | 0.372006 | 0.138889 | 0.963317 | 0.591837 | 0.735388 | 0.0 | 0.2 | 0.609099 |
| 1 | abbey | 0.214286 | 0.174510 | 0.623736 | 0.423950 | 0.316893 | 0.688108 | 0.676540 | 0.385698 | 0.0 | 0.4 | 0.746168 |
| 2 | abbreviate | 0.571429 | 0.198693 | 0.554024 | 0.512286 | 0.656463 | 0.751739 | 0.233339 | 0.497572 | 0.0 | 0.2 | 0.278825 |
| 3 | abdicate | 0.428571 | 0.349183 | 0.359328 | 0.385381 | 0.439153 | 0.871697 | 0.596713 | 0.697873 | 0.0 | 0.2 | 0.222201 |
| 4 | abdication | 0.571429 | 0.292320 | 0.374163 | 0.444635 | 0.273998 | 0.901947 | 0.669677 | 0.598057 | 0.0 | 0.2 | 0.220248 |
# Drop word again; describe() below confirms every feature lies in [0, 1].
df_xminmax=df_xminmax.drop(['word'],axis=1)
df_xminmax.describe()
| length | arousal | valence | dominance | familiarity | aoa | semsize | masculinity | polysemy | web_corpus_freq | perceivability | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 4682.000000 | 4682.000000 | 4682.000000 | 4682.000000 | 4682.000000 | 4682.000000 | 4682.000000 | 4682.000000 | 4682.000000 | 4682.000000 | 4682.000000 |
| mean | 0.310597 | 0.428289 | 0.532598 | 0.482728 | 0.684871 | 0.508419 | 0.498718 | 0.519165 | 0.080948 | 0.457027 | 0.554749 |
| std | 0.143302 | 0.179275 | 0.209314 | 0.144739 | 0.174077 | 0.217797 | 0.184810 | 0.152787 | 0.272785 | 0.168797 | 0.266786 |
| min | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 0.214286 | 0.292810 | 0.405015 | 0.402488 | 0.578042 | 0.329451 | 0.372584 | 0.436443 | 0.000000 | 0.400000 | 0.313092 |
| 50% | 0.285714 | 0.410784 | 0.559275 | 0.494868 | 0.716364 | 0.514256 | 0.507766 | 0.522693 | 0.000000 | 0.400000 | 0.538709 |
| 75% | 0.428571 | 0.549346 | 0.664041 | 0.569051 | 0.816704 | 0.683762 | 0.633375 | 0.612293 | 0.000000 | 0.600000 | 0.813629 |
| max | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
PCA_df
| 0 | 1 | |
|---|---|---|
| 0 | 3.120256 | -3.279911 |
| 1 | -0.691649 | -0.663581 |
| 2 | 3.652448 | -0.272761 |
| 3 | 3.213184 | -2.161888 |
| 4 | 5.052306 | -1.730781 |
| ... | ... | ... |
| 4677 | 2.402470 | -0.047095 |
| 4678 | -2.805922 | -1.236960 |
| 4679 | -1.332992 | 0.901015 |
| 4680 | -3.880212 | 1.497711 |
| 4681 | 1.500057 | 0.195502 |
4682 rows × 2 columns
# Inspect the types of the PCA score columns and of the scaled matrix.
x = PCA_df[0]
y = PCA_df[1]
#z = PCA_df[2]
print(type (x))
print(type (y))
#print(type (z))
print(type (Y_pca_minmax))
<class 'pandas.core.series.Series'> <class 'pandas.core.series.Series'> <class 'numpy.ndarray'>
print (Y_pca_minmax)
[[0.5675594 0.14122187] [0.31309066 0.39625457] [0.60308653 0.43435069] ... [0.270277 0.54876721] [0.10023392 0.60693153] [0.45940086 0.47999567]]
# K-means with k=3 on the min-max scaled PCA scores.
kmeans = KMeans( n_clusters=3, n_init=10, max_iter=100 )
kmeans.fit(Y_pca_minmax)
KMeans(max_iter=100, n_clusters=3)
kmeans.labels_
array([2, 0, 2, ..., 0, 0, 1], dtype=int32)
# Internal validation metrics for k=3.
print('SSE', kmeans.inertia_)
print('Silhouette', silhouette_score(Y_pca_minmax, kmeans.labels_))
SSE 107.1169618212198 Silhouette 0.36741782484823465
print('Calinski-Harabasz', metrics.calinski_harabasz_score(Y_pca_minmax, kmeans.labels_))
Calinski-Harabasz 3575.890305688165
# Silhouette-vs-k sweep kept commented out (slow to rerun); k=3 was chosen
# from its output.
silhouette_list = []
#for k in range(2, 50):
#    kmeans = KMeans(n_clusters=k, n_init=10, max_iter=100)
#    kmeans.fit(Y_pca_minmax)
#    silhouette_list.append( silhouette_score(Y_pca_minmax, kmeans.labels_))
#plt.plot(range(2, len(silhouette_list)+2), silhouette_list, marker='*')
#plt.ylabel('Silhouette score')
#plt.show()
# Refit to obtain the centers and labels used in the plots below.
kmeans = KMeans(n_clusters=3, n_init=10, max_iter=100)
kmeans.fit(Y_pca_minmax)
KMeans(max_iter=100, n_clusters=3)
centers = kmeans.cluster_centers_
kmeans.labels_
array([2, 1, 2, ..., 1, 1, 0], dtype=int32)
from matplotlib import cm
# Scatter of the unscaled PCA scores colored by k-means label...
plt.scatter( Y_pca[:,0], Y_pca[:,1], s=40, c= kmeans.labels_ , cmap = cm.tab20c)
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)
plt.show()
# ...and of the scaled scores, with cluster centers marked by red stars.
plt.scatter( Y_pca_minmax[:,0], Y_pca_minmax[:,1], s=40, c= kmeans.labels_ , cmap = cm.tab20c)
plt.scatter( centers[:,0], centers[:,1], c='red', marker='*', s=200 )
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)
plt.show()
# Attach the k-means labels; map them to the strings '1'/'2'/'3' for plotting.
scaled_dfprepro['cluster'] = kmeans.labels_
df_xminmax['cluster'] = kmeans.labels_
df_xminmax['cluster']=df_xminmax['cluster'].map({0:'1', 1:'2', 2:'3'})
df_xminmax_plot=df_xminmax.drop(["polysemy"], axis=1)
# Pie chart of cluster sizes.
plt.figure(figsize = (8,8))
pie = scaled_dfprepro.groupby(['cluster']).size().to_frame().reset_index()
pie.rename(columns={0: 'count'}, inplace=True)
pie_labels = ['first cluster', 'second cluster', 'third cluster']
plt.pie(pie['count'], labels=pie_labels)
plt.show()
# Per-cluster boxplots of each feature under the k-means labeling: the first
# five features fill the left column and the next five the right column of a
# 6x2 grid.
sb.set(font_scale=3.7)
fig, axs = plt.subplots(ncols=2, nrows=6, figsize=(30,60))
for row, feature in enumerate(df_xminmax_plot.columns[:5]):
    left = sb.boxplot(y=feature, x='cluster', data=df_xminmax_plot, order=['1', '2','3'], ax=axs[row,0], showfliers = True )
    left.set(xlabel=None)
for row, feature in enumerate(df_xminmax_plot.columns[5:10]):
    right = sb.boxplot(y=feature, x='cluster', data=df_xminmax_plot, order=['1', '2','3'], ax=axs[row,1], showfliers = True )
    right.set(xlabel=None)
df_perc.isnull().sum()
df_xminmax_plot['word']=df_perc['word']
df_xminmax_plot.loc[df_xminmax['cluster']=='First']
| length | arousal | valence | dominance | familiarity | aoa | semsize | masculinity | web_corpus_freq | perceivability | cluster | word |
|---|
df_xminmax_plot.loc[df_xminmax['cluster']=='Second']
| length | arousal | valence | dominance | familiarity | aoa | semsize | masculinity | web_corpus_freq | perceivability | cluster | word |
|---|
df_xminmax_plot.loc[df_xminmax['cluster']=='Third']
| length | arousal | valence | dominance | familiarity | aoa | semsize | masculinity | web_corpus_freq | perceivability | cluster | word |
|---|
from sklearn.neighbors import NearestNeighbors
# k-distance (elbow) plot used to pick eps for DBSCAN.
neighbors = NearestNeighbors(n_neighbors=20)
neighbors_fit = neighbors.fit(Y_pca_minmax)
distances, indices = neighbors_fit.kneighbors(Y_pca_minmax)
distances = np.sort(distances, axis=0)
# NOTE(review): column 1 is the distance to the nearest non-self neighbor,
# even though n_neighbors=20 (min_samples used below) — confirm intended.
distances = distances[:,1]
plt.plot(distances)
[<matplotlib.lines.Line2D at 0x7f8731efc310>]
# Grid-search DBSCAN's eps over 15 values (0.012 .. 0.026) and plot each
# resulting labeling in a 3x5 panel, annotated with the cluster count.
fig = plt.figure(figsize=(20, 12))
fig.subplots_adjust(hspace=.5, wspace=.2)
i = 1
for x in range(0, 15, 1):
    eps = 0.012+1*x/1000
    db = DBSCAN(eps=eps, min_samples=20).fit(Y_pca_minmax)
    core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
    core_samples_mask[db.core_sample_indices_] = True
    labels = db.labels_
    ax = fig.add_subplot(3, 5, i)
    ax.title.set_text("eps = {}".format(round(eps, 3)))
    # Fix: pass x/y as keyword arguments — positional use is deprecated in
    # seaborn and produced the FutureWarnings recorded in the output below.
    sb.scatterplot(x=Y_pca_minmax[:,0], y=Y_pca_minmax[:,1], hue=["{} cluster".format(x) for x in labels],legend=False,palette='tab20c',markers='.',size=1)
    # max(labels)+1 counts the clusters (noise is labeled -1)
    ax.text(0.5,-0.3, "{} cluster(s)".format(max(labels)+1), size=12, ha="center", transform=ax.transAxes)
    i += 1
/usr/local/lib/python3.7/dist-packages/seaborn/_decorators.py:43: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation. FutureWarning /usr/local/lib/python3.7/dist-packages/seaborn/_decorators.py:43: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation. FutureWarning /usr/local/lib/python3.7/dist-packages/seaborn/_decorators.py:43: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation. FutureWarning /usr/local/lib/python3.7/dist-packages/seaborn/_decorators.py:43: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation. FutureWarning /usr/local/lib/python3.7/dist-packages/seaborn/_decorators.py:43: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation. FutureWarning /usr/local/lib/python3.7/dist-packages/seaborn/_decorators.py:43: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation. 
FutureWarning /usr/local/lib/python3.7/dist-packages/seaborn/_decorators.py:43: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation. FutureWarning /usr/local/lib/python3.7/dist-packages/seaborn/_decorators.py:43: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation. FutureWarning /usr/local/lib/python3.7/dist-packages/seaborn/_decorators.py:43: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation. FutureWarning /usr/local/lib/python3.7/dist-packages/seaborn/_decorators.py:43: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation. FutureWarning /usr/local/lib/python3.7/dist-packages/seaborn/_decorators.py:43: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation. FutureWarning /usr/local/lib/python3.7/dist-packages/seaborn/_decorators.py:43: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation. 
FutureWarning /usr/local/lib/python3.7/dist-packages/seaborn/_decorators.py:43: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation. FutureWarning /usr/local/lib/python3.7/dist-packages/seaborn/_decorators.py:43: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation. FutureWarning /usr/local/lib/python3.7/dist-packages/seaborn/_decorators.py:43: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation. FutureWarning
# Final DBSCAN fit with the eps chosen from the grid above.
dbscan = DBSCAN( eps=0.018, min_samples=20)
dbscan.fit(Y_pca_minmax)
DBSCAN(eps=0.018, min_samples=20)
dbscan.labels_
array([-1, 0, -1, ..., 0, -1, 5])
# Highest cluster id found (labels run 0..10; -1 marks noise).
unique_labels, unique_counts = np.unique(dbscan.labels_, return_counts=True)
print(max(unique_labels))
10
#cols = [col for col in PCA_df.columns if col != 'class']
# Scatter of the scaled PCA scores colored by DBSCAN label.
plt.scatter( Y_pca_minmax[:,0], Y_pca_minmax[:,1], s=25, c= dbscan.labels_, cmap = cm.tab20 )
plt.xticks(fontsize=2)
plt.yticks(fontsize=2)
plt.show()
# Attach DBSCAN labels. The first map (to ordinal names) is immediately
# overwritten by the reassignment and second map (to numeric strings) below.
scaled_dfprepro['cluster'] = dbscan.labels_
df_xminmax['cluster'] = dbscan.labels_
df_xminmax['cluster']=df_xminmax['cluster'].map({0:'First', 1:'Second', 2:'Third',3:'Fourth', 4:'Fifth', 5:'Sixth',6:'Seventh', 7:'Eighth', 8:'Nineth',9:'Tenth', 10:'Eleventh', -1:'Noise'})
scaled_dfprepro['cluster'] = dbscan.labels_
df_xminmax['cluster'] = dbscan.labels_
df_xminmax['cluster']=df_xminmax['cluster'].map({-1:'0',0:'1', 1:'2', 2:'3',3:'4',4:'5', 5:'6',6:'7', 7:'8', 8:'9',9:'10', 10:'11'})
df_xminmax_plot=df_xminmax.drop(["polysemy"], axis=1)
# NOTE(review): after the map above 'cluster' holds strings, so != -1 never
# excludes anything — noise rows ('0') are kept, which matches the 12-entry
# label list below; also note groupby sorts the string keys lexicographically
# ('0','1','10','11','2',...), so labels may not align with the wedges — verify.
df_xminmax_plot = df_xminmax_plot.loc[df_xminmax["cluster"] != -1]
plt.figure(figsize = (8,8))
pie = df_xminmax_plot.groupby(['cluster']).size().to_frame().reset_index()
pie.rename(columns={0: 'count'}, inplace=True)
pie_labels = ['noise','first cluster', 'second cluster', 'third cluster','fourth cluster','fifth cluster','sixth cluster','seventh cluster','eighth cluster','nineth cluster','tenth cluster','eleventh cluster']
plt.pie(pie['count'], labels=pie_labels)
plt.show()
# Per-cluster boxplots for the DBSCAN labeling (same 6x2 layout as k-means).
fig, axs = plt.subplots(ncols=2, nrows=6, figsize=(15,30))
for cols, x in zip(df_xminmax_plot.columns[:5], range(5)):
    for y in range(1):
        sb.boxplot(y=cols, x='cluster', data=df_xminmax_plot, order=['1', '2','3','4','5','6','7','8','9','10','11'], ax=axs[x,y], showfliers = True )
for cols, x in zip(df_xminmax_plot.columns[5:], range(5)):
    for y in range(1,2):
        sb.boxplot(y=cols, x='cluster', data=df_xminmax_plot, order=['1', '2','3','4','5','6','7','8','9','10','11'], ax=axs[x,y], showfliers = True )
# Validation metrics for the DBSCAN labeling (noise included).
print('Silhouette', silhouette_score(Y_pca_minmax, dbscan.labels_))
Silhouette -0.34242476647615017
print('Calinski-Harabasz', metrics.calinski_harabasz_score(Y_pca_minmax, dbscan.labels_))
Calinski-Harabasz 59.9496266793775
df_perc.isnull().sum()
df_xminmax_plot['word']=df_perc['word']
df_xminmax_plot.loc[df_xminmax['cluster']=='First']
| length | arousal | valence | dominance | familiarity | aoa | semsize | masculinity | web_corpus_freq | perceivability | cluster | word |
|---|
df_xminmax_plot.loc[df_xminmax['cluster']=='Second']
| length | arousal | valence | dominance | familiarity | aoa | semsize | masculinity | web_corpus_freq | perceivability | cluster | word |
|---|
df_xminmax_plot.loc[df_xminmax['cluster']=='Third']
| length | arousal | valence | dominance | familiarity | aoa | semsize | masculinity | web_corpus_freq | perceivability | cluster | word |
|---|
df_xminmax_plot.loc[df_xminmax['cluster']=='Fourth']
| length | arousal | valence | dominance | familiarity | aoa | semsize | masculinity | web_corpus_freq | perceivability | cluster | word |
|---|
df_xminmax_plot.loc[df_xminmax['cluster']=='Fifth']
| length | arousal | valence | dominance | familiarity | aoa | semsize | masculinity | web_corpus_freq | perceivability | cluster | word |
|---|
# Hierarchical clustering on the PCA-reduced, min-max scaled data:
# condensed pairwise Euclidean distance matrix -> complete linkage ->
# dendrogram for choosing the cut.
data_dist = pdist(Y_pca_minmax, metric='euclidean')
data_link = linkage(data_dist, method='complete')
# truncate_mode='lastp' collapses the tree so only the last merges are drawn
res = dendrogram(data_link, truncate_mode='lastp')
# Cut at 6 clusters with sklearn, using the same metric/linkage as above.
# NOTE(review): `affinity` was renamed `metric` in scikit-learn 1.2 and
# removed in 1.4 — update if the sklearn version is ever bumped.
hier = AgglomerativeClustering(n_clusters=6, affinity='euclidean', linkage='complete')
hier.fit(Y_pca_minmax)
hier.labels_
hier.n_clusters_
#hier.n_leaves_
# Scatter of the first two PCA components coloured by hierarchical cluster.
# NOTE(review): `cm` is not imported in the visible header — presumably
# `from matplotlib import cm` in an earlier cell; confirm before re-running.
plt.scatter( Y_pca_minmax[:,0], Y_pca_minmax[:,1],s=50, c= hier.labels_, marker='.', cmap = cm.tab20)
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)
plt.show()
# Store the labels
labels = hier.labels_
# Frequency count per cluster; the >= 0 mask is a safeguard for noise
# labels (-1) — agglomerative labels here are always non-negative.
counts = np.bincount(labels[labels>=0])
print (counts)
[2205 310 355 393 732 687]
# Internal validation of the 6-cluster hierarchical solution — clearly
# better than the DBSCAN run above (positive silhouette, much higher CH).
print('Silhouette', silhouette_score(Y_pca_minmax, hier.labels_))
Silhouette 0.21403647105170687
print('Calinski-Harabasz', metrics.calinski_harabasz_score(Y_pca_minmax, hier.labels_))
Calinski-Harabasz 2074.84678178521
# Attach the hierarchical labels to both frames, rename the integer
# labels 0..5 to the display strings '1'..'6' used by the plots below,
# and drop the binary 'polysemy' column from the plotting frame.
scaled_dfprepro['cluster'] = hier.labels_
df_xminmax['cluster'] = hier.labels_
df_xminmax['cluster'] = df_xminmax['cluster'].map({i: str(i + 1) for i in range(6)})
df_xminmax_plot = df_xminmax.drop(["polysemy"], axis=1)
# Pie chart of cluster sizes; groupby keeps clusters in label order, so
# slice i corresponds to cluster i+1.
plt.figure(figsize=(8, 8))
pie = scaled_dfprepro.groupby(['cluster']).size().reset_index(name='count')
pie_labels = ['first cluster', 'second cluster', 'third cluster',
              'fourth cluster', 'fifth cluster', 'sixth cluster']
plt.pie(pie['count'], labels=pie_labels)
plt.show()
# Per-cluster boxplots for the 6-cluster hierarchical solution:
# first five features down the left axes column, next five down the right.
sb.set(font_scale=3.7)
fig, axs = plt.subplots(ncols=2, nrows=6, figsize=(30, 60))
hier_order = ['1', '2', '3', '4', '5', '6']
for row, feature in enumerate(df_xminmax_plot.columns[:5]):
    ax_box = sb.boxplot(y=feature, x='cluster', data=df_xminmax_plot,
                        order=hier_order, ax=axs[row, 0], showfliers=True)
    ax_box.set(xlabel=None)
for row, feature in enumerate(df_xminmax_plot.columns[5:10]):
    ax_box = sb.boxplot(y=feature, x='cluster', data=df_xminmax_plot,
                        order=hier_order, ax=axs[row, 1], showfliers=True)
    ax_box.set(xlabel=None)
# Sanity-check for missing values, then attach the words for inspection.
df_perc.isnull().sum()
df_xminmax_plot['word']=df_perc['word']
# BUG FIX: clusters were mapped to the strings '1'..'6' above, so
# filtering on 'First' always produced an empty frame (the original
# output table below the call was empty).
df_xminmax_plot.loc[df_xminmax['cluster']=='1']
| length | arousal | valence | dominance | familiarity | aoa | semsize | masculinity | web_corpus_freq | perceivability | cluster | word |
|---|
# BUG FIX: cluster labels are the strings '1'..'6'; 'Second' matched nothing.
df_xminmax_plot.loc[df_xminmax['cluster']=='2']
| length | arousal | valence | dominance | familiarity | aoa | semsize | masculinity | web_corpus_freq | perceivability | cluster | word |
|---|
# BUG FIX: cluster labels are the strings '1'..'6'; 'Third' matched nothing.
df_xminmax_plot.loc[df_xminmax['cluster']=='3']
| length | arousal | valence | dominance | familiarity | aoa | semsize | masculinity | web_corpus_freq | perceivability | cluster | word |
|---|
# BUG FIX: cluster labels are the strings '1'..'6'; 'Fourth' matched nothing.
df_xminmax_plot.loc[df_xminmax['cluster']=='4']
| length | arousal | valence | dominance | familiarity | aoa | semsize | masculinity | web_corpus_freq | perceivability | cluster | word |
|---|
# BUG FIX: cluster labels are the strings '1'..'6'; 'Fifth' matched nothing.
df_xminmax_plot.loc[df_xminmax['cluster']=='5']
| length | arousal | valence | dominance | familiarity | aoa | semsize | masculinity | web_corpus_freq | perceivability | cluster | word |
|---|
# non serve ora
#hier = AgglomerativeClustering(n_clusters=5, affinity='euclidean', linkage='ward')
#hier.fit(Y_pca_minmax)
#hier.labels_
#fig = plt.figure()
#ax = fig.add_subplot(111, projection='3d')
#ax.scatter(Y_pca_minmax[:,0], Y_pca_minmax[:,1], Y_pca_minmax[:,2], s=5, c= hier.labels_, marker='.')
#ax.plot3D(Yemo[:,1], Yemo[:,2], Yemo[:,3], c= hier.labels_, marker='.')
#plt.scatter( Yemo[:,1], Yemo[:,2], s=50, c= hier.labels_, marker='.' )
#plt.show()
# per il decision tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
# visualizzarlo
from sklearn import tree
import pydotplus
from IPython.display import Image
# evaluazione
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.metrics import roc_curve, auc, roc_auc_score
# hyperparameter tuning
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
# cross-validation
from sklearn.model_selection import cross_val_score
# Work on a fresh copy of the preprocessed dataset for classification,
# leaving dfprepro untouched for the later arousal task.
df_class= dfprepro.copy()
df_class.head()
| length | arousal | valence | dominance | familiarity | aoa | semsize | masculinity | polysemy | web_corpus_freq | perceivability | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 8 | 4.200 | 2.864 | 4.333 | 2.382 | 6.760 | 4.652 | 5.391 | 0 | 5.0 | 4.9230 |
| 1 | 5 | 3.125 | 5.781 | 4.667 | 3.324 | 5.177 | 5.121 | 3.303 | 0 | 6.0 | 5.6250 |
| 2 | 10 | 3.273 | 5.250 | 5.235 | 5.121 | 5.543 | 2.667 | 3.971 | 0 | 5.0 | 3.2315 |
| 3 | 8 | 4.194 | 3.767 | 4.419 | 3.971 | 6.233 | 4.679 | 5.167 | 0 | 5.0 | 2.9415 |
| 4 | 10 | 3.846 | 3.880 | 4.800 | 3.097 | 6.407 | 5.083 | 4.571 | 0 | 5.0 | 2.9315 |
# Classification target: 'polysemy'; predictors: every other column.
attributes = [col for col in df_class.columns if col != 'polysemy']
X = df_class[attributes].values
y = df_class['polysemy']
# Stratified 70/30 split keeps the (heavy) class imbalance identical in
# train and test; fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=100)
len(df_class), X_train.shape[0], X_test.shape[0]
(4682, 3277, 1405)
X_train.shape, X_test.shape
((3277, 10), (1405, 10))
# Fully grown tree: max_depth=None with min_samples_leaf=1 lets it
# memorize the training set (train accuracy is 1.0 further below) —
# expect overfitting; a pruned/tuned tree would generalize better.
clf = DecisionTreeClassifier(criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1)
clf.fit(X_train, y_train)
DecisionTreeClassifier()
# Per-feature importance of the fitted tree, in attribute order.
for col, imp in zip(attributes, clf.feature_importances_): print(col, imp)
length 0.07921529222712845 arousal 0.10707368272885229 valence 0.0955854527681356 dominance 0.12345386186225976 familiarity 0.1131051588586401 aoa 0.10922341363043479 semsize 0.10343533635767627 masculinity 0.10334263776768145 web_corpus_freq 0.024459922367955262 perceivability 0.1411052414312361
# Render only the top of the fitted tree (depth 2) as a PNG via graphviz.
dot_data = tree.export_graphviz(
    clf,
    out_file=None,
    max_depth=2,
    feature_names=attributes,
    class_names=[str(v) for v in clf.classes_],
    filled=True,
    rounded=True,
    special_characters=True,
)
graph = pydotplus.graph_from_dot_data(dot_data)
Image(graph.create_png())
# Apply the decision tree to the TRAIN set (resubstitution estimate).
y_pred = clf.predict(X_train)
# Eyeball the first few predictions against the true labels.
y_pred[:5]
array([0, 1, 0, 0, 0])
y_train.values[:5]
array([0, 1, 0, 0, 0])
# Perfect train scores confirm the unpruned tree memorized the data.
print('Accuracy', accuracy_score(y_train, y_pred))
print('F1', f1_score(y_train, y_pred, average=None))
Accuracy 1.0 F1 [1. 1.]
print( classification_report(y_train, y_pred) )
precision recall f1-score support
0 1.00 1.00 1.00 3012
1 1.00 1.00 1.00 265
accuracy 1.00 3277
macro avg 1.00 1.00 1.00 3277
weighted avg 1.00 1.00 1.00 3277
# Confusion matrix for trainset
# TP, FN, FP, TN
confusion_matrix(y_train, y_pred)
array([[3012, 0],
[ 0, 265]])
# Apply the decision tree to the TEST set. The drop from 1.0 train
# accuracy to ~0.86 here — and the minority-class F1 of ~0.19 — shows
# the unpruned tree overfits and barely learns the positive class.
y_pred = clf.predict(X_test)
print('Accuracy %s' % accuracy_score(y_test, y_pred))
print('F1-score %s' % f1_score(y_test, y_pred, average=None))
print(classification_report(y_test, y_pred))
confusion_matrix(y_test, y_pred)
Accuracy 0.8597864768683274
F1-score [0.92325672 0.18930041]
precision recall f1-score support
0 0.93 0.92 0.92 1291
1 0.18 0.20 0.19 114
accuracy 0.86 1405
macro avg 0.55 0.56 0.56 1405
weighted avg 0.87 0.86 0.86 1405
array([[1185, 106],
[ 91, 23]])
# ROC analysis on the test set using the positive-class probabilities
# (column 1 of predict_proba). AUC ~0.56 — barely above chance.
y_score = clf.predict_proba(X_test)
fpr, tpr, th = roc_curve(y_test, y_score[:,1])
roc_auc = auc(fpr, tpr)
print(roc_auc)
0.5598237460420998
# Plot the ROC curve against the diagonal chance line.
plt.figure(figsize=(8,5))
plt.plot(fpr, tpr)
plt.plot([0,1], [0,1], 'k--')
plt.xlabel('False Positive Rate', fontsize=20)
plt.ylabel('True Positive Rate', fontsize=20)
plt.tick_params(axis='both', which='major', labelsize=22)
plt.show()
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix
from sklearn.model_selection import cross_val_score
# Re-inspect the preprocessed dataset before the second (arousal) task.
dfprepro.head()
| length | arousal | valence | dominance | familiarity | aoa | semsize | masculinity | polysemy | web_corpus_freq | perceivability | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 8 | 4.200 | 2.864 | 4.333 | 2.382 | 6.760 | 4.652 | 5.391 | 0 | 5.0 | 4.9230 |
| 1 | 5 | 3.125 | 5.781 | 4.667 | 3.324 | 5.177 | 5.121 | 3.303 | 0 | 6.0 | 5.6250 |
| 2 | 10 | 3.273 | 5.250 | 5.235 | 5.121 | 5.543 | 2.667 | 3.971 | 0 | 5.0 | 3.2315 |
| 3 | 8 | 4.194 | 3.767 | 4.419 | 3.971 | 6.233 | 4.679 | 5.167 | 0 | 5.0 | 2.9415 |
| 4 | 10 | 3.846 | 3.880 | 4.800 | 3.097 | 6.407 | 5.083 | 4.571 | 0 | 5.0 | 2.9315 |
# Fresh copy of the preprocessed data for the arousal classification task.
df_class_ref = dfprepro.copy()
#dataframe = [df_class_ref]
#for dataset in dataframe:
#    dataset.loc[(dataset["aoa"] > 1) & (dataset["aoa"] <= 2), "aoa"] = 1
#    dataset.loc[(dataset["aoa"] > 2)& (dataset["aoa"] <= 3), "aoa"] = 2
#    dataset.loc[(dataset["aoa"] > 3)& (dataset["aoa"] <= 4), "aoa"] = 3
#    dataset.loc[(dataset["aoa"] > 4)& (dataset["aoa"] <= 5), "aoa"] = 4
#    dataset.loc[(dataset["aoa"] > 5)& (dataset["aoa"] <= 6), "aoa"] = 5
#    dataset.loc[(dataset["aoa"] > 6)&( dataset["aoa"] <= 7), "aoa"] = 6
#    dataset.loc[(dataset["aoa"] > 7), "aoa"] = 7
#df_class_ref.head()
# Min-max scale the rating variables to [0, 1] so the fixed 0.55 cutoff
# used below for the binary arousal target is on a known scale.
var_to_scale=['aoa',"arousal","valence","dominance","familiarity","semsize","masculinity","perceivability"]
scaler = MinMaxScaler()
features = scaler.fit_transform(df_class_ref[var_to_scale].values)
df_class_ref[var_to_scale] = features
df_class_ref.head()
| length | arousal | valence | dominance | familiarity | aoa | semsize | masculinity | polysemy | web_corpus_freq | perceivability | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 8 | 0.350163 | 0.240777 | 0.372006 | 0.138889 | 0.963317 | 0.591837 | 0.735388 | 0 | 5.0 | 0.609099 |
| 1 | 5 | 0.174510 | 0.623736 | 0.423950 | 0.316893 | 0.688108 | 0.676540 | 0.385698 | 0 | 6.0 | 0.746168 |
| 2 | 10 | 0.198693 | 0.554024 | 0.512286 | 0.656463 | 0.751739 | 0.233339 | 0.497572 | 0 | 5.0 | 0.278825 |
| 3 | 8 | 0.349183 | 0.359328 | 0.385381 | 0.439153 | 0.871697 | 0.596713 | 0.697873 | 0 | 5.0 | 0.222201 |
| 4 | 10 | 0.292320 | 0.374163 | 0.444635 | 0.273998 | 0.901947 | 0.669677 | 0.598057 | 0 | 5.0 | 0.220248 |
# Binary target: words whose min-max scaled arousal is >= 0.55 count as
# 'aroused' (1), the rest as 'not aroused' (0); predictors are all the
# remaining columns.
refvar = "arousal"
taglio = 0.55
X = df_class_ref.drop(refvar, axis=1).copy()
y = df_class_ref[refvar].copy()
y[y >= taglio] = 1
y[y < taglio] = 0
# Default 75/25 split, reproducible via the fixed random_state.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
# Fully grown tree on the arousal task (no depth limit).
clf_dt = DecisionTreeClassifier(criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1, random_state=42)
clf_dt = clf_dt.fit(X_train, y_train)
# Draw the whole fitted tree.
plt.figure(figsize=(15, 7.5))
from sklearn.tree import plot_tree
plot_tree(clf_dt,
          filled=True,
          rounded=True,
          class_names=["not aroused","aroused"],
          feature_names=X.columns)
[Text(534.7704919477663, 399.546, 'valence <= 0.698\ngini = 0.372\nsamples = 3511\nvalue = [2645, 866]\nclass = not aroused'), Text(355.33135788018956, 383.238, 'semsize <= 0.638\ngini = 0.25\nsamples = 2829\nvalue = [2414, 415]\nclass = not aroused'), Text(185.56261986687727, 366.93, 'semsize <= 0.496\ngini = 0.164\nsamples = 2299\nvalue = [2092, 207]\nclass = not aroused'), Text(95.9751029444946, 350.62199999999996, 'masculinity <= 0.369\ngini = 0.101\nsamples = 1542\nvalue = [1460, 82]\nclass = not aroused'), Text(29.20938628158845, 334.31399999999996, 'semsize <= 0.366\ngini = 0.25\nsamples = 171\nvalue = [146, 25]\nclass = not aroused'), Text(16.115523465703973, 318.006, 'valence <= 0.654\ngini = 0.142\nsamples = 117\nvalue = [108, 9]\nclass = not aroused'), Text(8.057761732851986, 301.698, 'web_corpus_freq <= 5.5\ngini = 0.063\nsamples = 92\nvalue = [89, 3]\nclass = not aroused'), Text(4.028880866425993, 285.39, 'aoa <= 0.251\ngini = 0.231\nsamples = 15\nvalue = [13, 2]\nclass = not aroused'), Text(2.0144404332129966, 269.082, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'), Text(6.04332129963899, 269.082, 'dominance <= 0.498\ngini = 0.133\nsamples = 14\nvalue = [13, 1]\nclass = not aroused'), Text(4.028880866425993, 252.774, 'gini = 0.0\nsamples = 10\nvalue = [10, 0]\nclass = not aroused'), Text(8.057761732851986, 252.774, 'semsize <= 0.255\ngini = 0.375\nsamples = 4\nvalue = [3, 1]\nclass = not aroused'), Text(6.04332129963899, 236.46599999999998, 'gini = 0.0\nsamples = 3\nvalue = [3, 0]\nclass = not aroused'), Text(10.072202166064983, 236.46599999999998, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'), Text(12.08664259927798, 285.39, 'valence <= 0.636\ngini = 0.026\nsamples = 77\nvalue = [76, 1]\nclass = not aroused'), Text(10.072202166064983, 269.082, 'gini = 0.0\nsamples = 65\nvalue = [65, 0]\nclass = not aroused'), Text(14.101083032490976, 269.082, 'valence <= 0.637\ngini = 0.153\nsamples = 12\nvalue = [11, 1]\nclass = not 
aroused'), Text(12.08664259927798, 252.774, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'), Text(16.115523465703973, 252.774, 'gini = 0.0\nsamples = 11\nvalue = [11, 0]\nclass = not aroused'), Text(24.17328519855596, 301.698, 'semsize <= 0.142\ngini = 0.365\nsamples = 25\nvalue = [19, 6]\nclass = not aroused'), Text(22.158844765342963, 285.39, 'gini = 0.0\nsamples = 2\nvalue = [0, 2]\nclass = aroused'), Text(26.187725631768956, 285.39, 'masculinity <= 0.216\ngini = 0.287\nsamples = 23\nvalue = [19, 4]\nclass = not aroused'), Text(22.158844765342963, 269.082, 'dominance <= 0.527\ngini = 0.48\nsamples = 5\nvalue = [2, 3]\nclass = aroused'), Text(20.144404332129966, 252.774, 'gini = 0.0\nsamples = 3\nvalue = [0, 3]\nclass = aroused'), Text(24.17328519855596, 252.774, 'gini = 0.0\nsamples = 2\nvalue = [2, 0]\nclass = not aroused'), Text(30.21660649819495, 269.082, 'familiarity <= 0.565\ngini = 0.105\nsamples = 18\nvalue = [17, 1]\nclass = not aroused'), Text(28.202166064981952, 252.774, 'familiarity <= 0.554\ngini = 0.444\nsamples = 3\nvalue = [2, 1]\nclass = not aroused'), Text(26.187725631768956, 236.46599999999998, 'gini = 0.0\nsamples = 2\nvalue = [2, 0]\nclass = not aroused'), Text(30.21660649819495, 236.46599999999998, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'), Text(32.231046931407946, 252.774, 'gini = 0.0\nsamples = 15\nvalue = [15, 0]\nclass = not aroused'), Text(42.30324909747293, 318.006, 'dominance <= 0.256\ngini = 0.417\nsamples = 54\nvalue = [38, 16]\nclass = not aroused'), Text(40.28880866425993, 301.698, 'gini = 0.0\nsamples = 4\nvalue = [0, 4]\nclass = aroused'), Text(44.317689530685925, 301.698, 'dominance <= 0.581\ngini = 0.365\nsamples = 50\nvalue = [38, 12]\nclass = not aroused'), Text(40.28880866425993, 285.39, 'aoa <= 0.67\ngini = 0.298\nsamples = 44\nvalue = [36, 8]\nclass = not aroused'), Text(38.274368231046935, 269.082, 'aoa <= 0.304\ngini = 0.383\nsamples = 31\nvalue = [23, 8]\nclass = not aroused'), 
Text(36.25992779783394, 252.774, 'gini = 0.0\nsamples = 9\nvalue = [9, 0]\nclass = not aroused'), Text(40.28880866425993, 252.774, 'masculinity <= 0.352\ngini = 0.463\nsamples = 22\nvalue = [14, 8]\nclass = not aroused'), Text(38.274368231046935, 236.46599999999998, 'aoa <= 0.433\ngini = 0.42\nsamples = 20\nvalue = [14, 6]\nclass = not aroused'), Text(34.24548736462094, 220.158, 'perceivability <= 0.658\ngini = 0.444\nsamples = 6\nvalue = [2, 4]\nclass = aroused'), Text(32.231046931407946, 203.85, 'gini = 0.0\nsamples = 3\nvalue = [0, 3]\nclass = aroused'), Text(36.25992779783394, 203.85, 'familiarity <= 0.821\ngini = 0.444\nsamples = 3\nvalue = [2, 1]\nclass = not aroused'), Text(34.24548736462094, 187.542, 'gini = 0.0\nsamples = 2\nvalue = [2, 0]\nclass = not aroused'), Text(38.274368231046935, 187.542, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'), Text(42.30324909747293, 220.158, 'familiarity <= 0.706\ngini = 0.245\nsamples = 14\nvalue = [12, 2]\nclass = not aroused'), Text(40.28880866425993, 203.85, 'gini = 0.0\nsamples = 10\nvalue = [10, 0]\nclass = not aroused'), Text(44.317689530685925, 203.85, 'perceivability <= 0.49\ngini = 0.5\nsamples = 4\nvalue = [2, 2]\nclass = not aroused'), Text(42.30324909747293, 187.542, 'gini = 0.0\nsamples = 2\nvalue = [2, 0]\nclass = not aroused'), Text(46.33212996389892, 187.542, 'gini = 0.0\nsamples = 2\nvalue = [0, 2]\nclass = aroused'), Text(42.30324909747293, 236.46599999999998, 'gini = 0.0\nsamples = 2\nvalue = [0, 2]\nclass = aroused'), Text(42.30324909747293, 269.082, 'gini = 0.0\nsamples = 13\nvalue = [13, 0]\nclass = not aroused'), Text(48.34657039711192, 285.39, 'dominance <= 0.629\ngini = 0.444\nsamples = 6\nvalue = [2, 4]\nclass = aroused'), Text(46.33212996389892, 269.082, 'gini = 0.0\nsamples = 4\nvalue = [0, 4]\nclass = aroused'), Text(50.361010830324915, 269.082, 'gini = 0.0\nsamples = 2\nvalue = [2, 0]\nclass = not aroused'), Text(162.74081960740074, 334.31399999999996, 'masculinity <= 
0.696\ngini = 0.08\nsamples = 1371\nvalue = [1314, 57]\nclass = not aroused'), Text(134.61340816787006, 318.006, 'valence <= 0.626\ngini = 0.065\nsamples = 1246\nvalue = [1204, 42]\nclass = not aroused'), Text(109.83421705776175, 301.698, 'aoa <= 0.971\ngini = 0.051\nsamples = 1116\nvalue = [1087, 29]\nclass = not aroused'), Text(95.21378610108304, 285.39, 'valence <= 0.364\ngini = 0.049\nsamples = 1114\nvalue = [1086, 28]\nclass = not aroused'), Text(72.01624548736463, 269.082, 'perceivability <= 0.894\ngini = 0.1\nsamples = 245\nvalue = [232, 13]\nclass = not aroused'), Text(65.46931407942239, 252.774, 'dominance <= 0.58\ngini = 0.066\nsamples = 233\nvalue = [225, 8]\nclass = not aroused'), Text(63.45487364620939, 236.46599999999998, 'aoa <= 0.458\ngini = 0.059\nsamples = 232\nvalue = [225, 7]\nclass = not aroused'), Text(56.404332129963905, 220.158, 'aoa <= 0.45\ngini = 0.139\nsamples = 80\nvalue = [74, 6]\nclass = not aroused'), Text(52.37545126353791, 203.85, 'familiarity <= 0.408\ngini = 0.098\nsamples = 77\nvalue = [73, 4]\nclass = not aroused'), Text(50.361010830324915, 187.542, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'), Text(54.38989169675091, 187.542, 'dominance <= 0.522\ngini = 0.076\nsamples = 76\nvalue = [73, 3]\nclass = not aroused'), Text(49.35379061371842, 171.23399999999998, 'perceivability <= 0.861\ngini = 0.053\nsamples = 73\nvalue = [71, 2]\nclass = not aroused'), Text(45.32490974729242, 154.926, 'aoa <= 0.24\ngini = 0.029\nsamples = 68\nvalue = [67, 1]\nclass = not aroused'), Text(43.31046931407943, 138.618, 'perceivability <= 0.356\ngini = 0.124\nsamples = 15\nvalue = [14, 1]\nclass = not aroused'), Text(41.29602888086643, 122.31, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'), Text(45.32490974729242, 122.31, 'gini = 0.0\nsamples = 14\nvalue = [14, 0]\nclass = not aroused'), Text(47.33935018050542, 138.618, 'gini = 0.0\nsamples = 53\nvalue = [53, 0]\nclass = not aroused'), Text(53.38267148014441, 154.926, 
'masculinity <= 0.65\ngini = 0.32\nsamples = 5\nvalue = [4, 1]\nclass = not aroused'), Text(51.36823104693141, 138.618, 'gini = 0.0\nsamples = 4\nvalue = [4, 0]\nclass = not aroused'), Text(55.397111913357406, 138.618, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'), Text(59.4259927797834, 171.23399999999998, 'semsize <= 0.39\ngini = 0.444\nsamples = 3\nvalue = [2, 1]\nclass = not aroused'), Text(57.4115523465704, 154.926, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'), Text(61.440433212996396, 154.926, 'gini = 0.0\nsamples = 2\nvalue = [2, 0]\nclass = not aroused'), Text(60.4332129963899, 203.85, 'length <= 5.0\ngini = 0.444\nsamples = 3\nvalue = [1, 2]\nclass = aroused'), Text(58.4187725631769, 187.542, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = not aroused'), Text(62.447653429602894, 187.542, 'gini = 0.0\nsamples = 2\nvalue = [0, 2]\nclass = aroused'), Text(70.50541516245488, 220.158, 'masculinity <= 0.42\ngini = 0.013\nsamples = 152\nvalue = [151, 1]\nclass = not aroused'), Text(68.49097472924188, 203.85, 'masculinity <= 0.412\ngini = 0.18\nsamples = 10\nvalue = [9, 1]\nclass = not aroused'), Text(66.47653429602889, 187.542, 'gini = 0.0\nsamples = 9\nvalue = [9, 0]\nclass = not aroused'), Text(70.50541516245488, 187.542, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'), Text(72.51985559566788, 203.85, 'gini = 0.0\nsamples = 142\nvalue = [142, 0]\nclass = not aroused'), Text(67.48375451263539, 236.46599999999998, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'), Text(78.56317689530687, 252.774, 'semsize <= 0.182\ngini = 0.486\nsamples = 12\nvalue = [7, 5]\nclass = not aroused'), Text(76.54873646209387, 236.46599999999998, 'dominance <= 0.337\ngini = 0.219\nsamples = 8\nvalue = [7, 1]\nclass = not aroused'), Text(74.53429602888087, 220.158, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'), Text(78.56317689530687, 220.158, 'gini = 0.0\nsamples = 7\nvalue = [7, 0]\nclass = not aroused'), 
Text(80.57761732851986, 236.46599999999998, 'gini = 0.0\nsamples = 4\nvalue = [0, 4]\nclass = aroused'), Text(118.41132671480146, 269.082, 'familiarity <= 0.858\ngini = 0.034\nsamples = 869\nvalue = [854, 15]\nclass = not aroused'), Text(103.86958483754513, 252.774, 'masculinity <= 0.487\ngini = 0.023\nsamples = 760\nvalue = [751, 9]\nclass = not aroused'), Text(90.90162454873646, 236.46599999999998, 'masculinity <= 0.486\ngini = 0.053\nsamples = 222\nvalue = [216, 6]\nclass = not aroused'), Text(84.10288808664261, 220.158, 'dominance <= 0.514\ngini = 0.045\nsamples = 219\nvalue = [214, 5]\nclass = not aroused'), Text(76.54873646209387, 203.85, 'masculinity <= 0.483\ngini = 0.013\nsamples = 156\nvalue = [155, 1]\nclass = not aroused'), Text(74.53429602888087, 187.542, 'gini = 0.0\nsamples = 146\nvalue = [146, 0]\nclass = not aroused'), Text(78.56317689530687, 187.542, 'masculinity <= 0.483\ngini = 0.18\nsamples = 10\nvalue = [9, 1]\nclass = not aroused'), Text(76.54873646209387, 171.23399999999998, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'), Text(80.57761732851986, 171.23399999999998, 'gini = 0.0\nsamples = 9\nvalue = [9, 0]\nclass = not aroused'), Text(91.65703971119135, 203.85, 'dominance <= 0.516\ngini = 0.119\nsamples = 63\nvalue = [59, 4]\nclass = not aroused'), Text(86.62093862815885, 187.542, 'web_corpus_freq <= 7.0\ngini = 0.444\nsamples = 3\nvalue = [1, 2]\nclass = aroused'), Text(84.60649819494586, 171.23399999999998, 'gini = 0.0\nsamples = 2\nvalue = [0, 2]\nclass = aroused'), Text(88.63537906137185, 171.23399999999998, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = not aroused'), Text(96.69314079422384, 187.542, 'masculinity <= 0.38\ngini = 0.064\nsamples = 60\nvalue = [58, 2]\nclass = not aroused'), Text(92.66425992779784, 171.23399999999998, 'semsize <= 0.277\ngini = 0.375\nsamples = 4\nvalue = [3, 1]\nclass = not aroused'), Text(90.64981949458485, 154.926, 'gini = 0.0\nsamples = 3\nvalue = [3, 0]\nclass = not aroused'), 
Text(94.67870036101084, 154.926, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'), Text(100.72202166064983, 171.23399999999998, 'semsize <= 0.48\ngini = 0.035\nsamples = 56\nvalue = [55, 1]\nclass = not aroused'), Text(98.70758122743683, 154.926, 'gini = 0.0\nsamples = 51\nvalue = [51, 0]\nclass = not aroused'), Text(102.73646209386283, 154.926, 'perceivability <= 0.592\ngini = 0.32\nsamples = 5\nvalue = [4, 1]\nclass = not aroused'), Text(100.72202166064983, 138.618, 'gini = 0.0\nsamples = 4\nvalue = [4, 0]\nclass = not aroused'), Text(104.75090252707582, 138.618, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'), Text(97.70036101083033, 220.158, 'length <= 8.0\ngini = 0.444\nsamples = 3\nvalue = [2, 1]\nclass = not aroused'), Text(95.68592057761734, 203.85, 'gini = 0.0\nsamples = 2\nvalue = [2, 0]\nclass = not aroused'), Text(99.71480144404333, 203.85, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'), Text(116.8375451263538, 236.46599999999998, 'aoa <= 0.882\ngini = 0.011\nsamples = 538\nvalue = [535, 3]\nclass = not aroused'), Text(110.79422382671481, 220.158, 'masculinity <= 0.665\ngini = 0.008\nsamples = 525\nvalue = [523, 2]\nclass = not aroused'), Text(104.75090252707582, 203.85, 'familiarity <= 0.829\ngini = 0.004\nsamples = 494\nvalue = [493, 1]\nclass = not aroused'), Text(102.73646209386283, 187.542, 'gini = 0.0\nsamples = 453\nvalue = [453, 0]\nclass = not aroused'), Text(106.76534296028882, 187.542, 'familiarity <= 0.831\ngini = 0.048\nsamples = 41\nvalue = [40, 1]\nclass = not aroused'), Text(104.75090252707582, 171.23399999999998, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'), Text(108.77978339350182, 171.23399999999998, 'gini = 0.0\nsamples = 40\nvalue = [40, 0]\nclass = not aroused'), Text(116.8375451263538, 203.85, 'masculinity <= 0.667\ngini = 0.062\nsamples = 31\nvalue = [30, 1]\nclass = not aroused'), Text(114.8231046931408, 187.542, 'aoa <= 0.66\ngini = 0.444\nsamples = 3\nvalue = [2, 1]\nclass = 
not aroused'), Text(112.80866425992781, 171.23399999999998, 'gini = 0.0\nsamples = 2\nvalue = [2, 0]\nclass = not aroused'), Text(116.8375451263538, 171.23399999999998, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'), Text(118.8519855595668, 187.542, 'gini = 0.0\nsamples = 28\nvalue = [28, 0]\nclass = not aroused'), Text(122.88086642599279, 220.158, 'aoa <= 0.884\ngini = 0.142\nsamples = 13\nvalue = [12, 1]\nclass = not aroused'), Text(120.8664259927798, 203.85, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'), Text(124.89530685920579, 203.85, 'gini = 0.0\nsamples = 12\nvalue = [12, 0]\nclass = not aroused'), Text(132.95306859205778, 252.774, 'familiarity <= 0.859\ngini = 0.104\nsamples = 109\nvalue = [103, 6]\nclass = not aroused'), Text(130.93862815884478, 236.46599999999998, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'), Text(134.96750902527077, 236.46599999999998, 'aoa <= 0.627\ngini = 0.088\nsamples = 108\nvalue = [103, 5]\nclass = not aroused'), Text(130.93862815884478, 220.158, 'dominance <= 0.558\ngini = 0.056\nsamples = 104\nvalue = [101, 3]\nclass = not aroused'), Text(128.92418772563178, 203.85, 'gini = 0.0\nsamples = 76\nvalue = [76, 0]\nclass = not aroused'), Text(132.95306859205778, 203.85, 'dominance <= 0.559\ngini = 0.191\nsamples = 28\nvalue = [25, 3]\nclass = not aroused'), Text(130.93862815884478, 187.542, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'), Text(134.96750902527077, 187.542, 'masculinity <= 0.423\ngini = 0.137\nsamples = 27\nvalue = [25, 2]\nclass = not aroused'), Text(130.93862815884478, 171.23399999999998, 'dominance <= 0.584\ngini = 0.5\nsamples = 2\nvalue = [1, 1]\nclass = not aroused'), Text(128.92418772563178, 154.926, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'), Text(132.95306859205778, 154.926, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = not aroused'), Text(138.99638989169677, 171.23399999999998, 'masculinity <= 0.615\ngini = 0.077\nsamples = 25\nvalue = 
[24, 1]\nclass = not aroused'), Text(136.98194945848377, 154.926, 'gini = 0.0\nsamples = 22\nvalue = [22, 0]\nclass = not aroused'), Text(141.01083032490976, 154.926, 'valence <= 0.594\ngini = 0.444\nsamples = 3\nvalue = [2, 1]\nclass = not aroused'), Text(138.99638989169677, 138.618, 'gini = 0.0\nsamples = 2\nvalue = [2, 0]\nclass = not aroused'), Text(143.02527075812276, 138.618, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'), Text(138.99638989169677, 220.158, 'familiarity <= 0.866\ngini = 0.5\nsamples = 4\nvalue = [2, 2]\nclass = not aroused'), Text(136.98194945848377, 203.85, 'gini = 0.0\nsamples = 2\nvalue = [2, 0]\nclass = not aroused'), Text(141.01083032490976, 203.85, 'gini = 0.0\nsamples = 2\nvalue = [0, 2]\nclass = aroused'), Text(124.45464801444045, 285.39, 'perceivability <= 0.756\ngini = 0.5\nsamples = 2\nvalue = [1, 1]\nclass = not aroused'), Text(122.44020758122745, 269.082, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = not aroused'), Text(126.46908844765345, 269.082, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'), Text(159.39259927797835, 301.698, 'aoa <= 0.204\ngini = 0.18\nsamples = 130\nvalue = [117, 13]\nclass = not aroused'), Text(157.37815884476535, 285.39, 'gini = 0.0\nsamples = 30\nvalue = [30, 0]\nclass = not aroused'), Text(161.40703971119135, 285.39, 'dominance <= 0.715\ngini = 0.226\nsamples = 100\nvalue = [87, 13]\nclass = not aroused'), Text(159.39259927797835, 269.082, 'masculinity <= 0.446\ngini = 0.213\nsamples = 99\nvalue = [87, 12]\nclass = not aroused'), Text(157.37815884476535, 252.774, 'gini = 0.0\nsamples = 31\nvalue = [31, 0]\nclass = not aroused'), Text(161.40703971119135, 252.774, 'masculinity <= 0.482\ngini = 0.291\nsamples = 68\nvalue = [56, 12]\nclass = not aroused'), Text(151.08303249097474, 236.46599999999998, 'length <= 5.5\ngini = 0.473\nsamples = 13\nvalue = [8, 5]\nclass = not aroused'), Text(147.05415162454875, 220.158, 'dominance <= 0.51\ngini = 0.346\nsamples = 9\nvalue = [7, 
2]\nclass = not aroused'), Text(145.03971119133575, 203.85, 'semsize <= 0.375\ngini = 0.5\nsamples = 4\nvalue = [2, 2]\nclass = not aroused'), Text(143.02527075812276, 187.542, 'gini = 0.0\nsamples = 2\nvalue = [2, 0]\nclass = not aroused'), Text(147.05415162454875, 187.542, 'gini = 0.0\nsamples = 2\nvalue = [0, 2]\nclass = aroused'), Text(149.06859205776175, 203.85, 'gini = 0.0\nsamples = 5\nvalue = [5, 0]\nclass = not aroused'), Text(155.11191335740074, 220.158, 'semsize <= 0.451\ngini = 0.375\nsamples = 4\nvalue = [1, 3]\nclass = aroused'), Text(153.09747292418774, 203.85, 'gini = 0.0\nsamples = 3\nvalue = [0, 3]\nclass = aroused'), Text(157.12635379061373, 203.85, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = not aroused'), Text(171.73104693140795, 236.46599999999998, 'valence <= 0.628\ngini = 0.222\nsamples = 55\nvalue = [48, 7]\nclass = not aroused'), Text(163.16967509025272, 220.158, 'familiarity <= 0.59\ngini = 0.48\nsamples = 5\nvalue = [3, 2]\nclass = not aroused'), Text(161.15523465703973, 203.85, 'gini = 0.0\nsamples = 2\nvalue = [2, 0]\nclass = not aroused'), Text(165.18411552346572, 203.85, 'semsize <= 0.455\ngini = 0.444\nsamples = 3\nvalue = [1, 2]\nclass = aroused'), Text(163.16967509025272, 187.542, 'gini = 0.0\nsamples = 2\nvalue = [0, 2]\nclass = aroused'), Text(167.19855595667872, 187.542, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = not aroused'), Text(180.2924187725632, 220.158, 'aoa <= 0.245\ngini = 0.18\nsamples = 50\nvalue = [45, 5]\nclass = not aroused'), Text(173.2418772563177, 203.85, 'semsize <= 0.224\ngini = 0.48\nsamples = 5\nvalue = [3, 2]\nclass = not aroused'), Text(171.2274368231047, 187.542, 'gini = 0.0\nsamples = 2\nvalue = [2, 0]\nclass = not aroused'), Text(175.2563176895307, 187.542, 'polysemy <= 0.5\ngini = 0.444\nsamples = 3\nvalue = [1, 2]\nclass = aroused'), Text(173.2418772563177, 171.23399999999998, 'gini = 0.0\nsamples = 2\nvalue = [0, 2]\nclass = aroused'), Text(177.2707581227437, 171.23399999999998, 'gini = 
0.0\nsamples = 1\nvalue = [1, 0]\nclass = not aroused'), Text(187.34296028880868, 203.85, 'familiarity <= 0.651\ngini = 0.124\nsamples = 45\nvalue = [42, 3]\nclass = not aroused'), Text(183.3140794223827, 187.542, 'perceivability <= 0.787\ngini = 0.408\nsamples = 7\nvalue = [5, 2]\nclass = not aroused'), Text(181.2996389891697, 171.23399999999998, 'familiarity <= 0.546\ngini = 0.444\nsamples = 3\nvalue = [1, 2]\nclass = aroused'), Text(179.2851985559567, 154.926, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = not aroused'), Text(183.3140794223827, 154.926, 'gini = 0.0\nsamples = 2\nvalue = [0, 2]\nclass = aroused'), Text(185.3285198555957, 171.23399999999998, 'gini = 0.0\nsamples = 4\nvalue = [4, 0]\nclass = not aroused'), Text(191.37184115523468, 187.542, 'familiarity <= 0.828\ngini = 0.051\nsamples = 38\nvalue = [37, 1]\nclass = not aroused'), Text(189.35740072202168, 171.23399999999998, 'gini = 0.0\nsamples = 28\nvalue = [28, 0]\nclass = not aroused'), Text(193.38628158844767, 171.23399999999998, 'familiarity <= 0.831\ngini = 0.18\nsamples = 10\nvalue = [9, 1]\nclass = not aroused'), Text(191.37184115523468, 154.926, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'), Text(195.40072202166067, 154.926, 'gini = 0.0\nsamples = 9\nvalue = [9, 0]\nclass = not aroused'), Text(163.42148014440434, 269.082, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'), Text(190.86823104693144, 318.006, 'valence <= 0.234\ngini = 0.211\nsamples = 125\nvalue = [110, 15]\nclass = not aroused'), Text(183.3140794223827, 301.698, 'perceivability <= 0.584\ngini = 0.48\nsamples = 10\nvalue = [4, 6]\nclass = aroused'), Text(181.2996389891697, 285.39, 'length <= 4.5\ngini = 0.32\nsamples = 5\nvalue = [4, 1]\nclass = not aroused'), Text(179.2851985559567, 269.082, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'), Text(183.3140794223827, 269.082, 'gini = 0.0\nsamples = 4\nvalue = [4, 0]\nclass = not aroused'), Text(185.3285198555957, 285.39, 'gini = 0.0\nsamples 
= 5\nvalue = [0, 5]\nclass = aroused'), Text(198.42238267148016, 301.698, 'dominance <= 0.608\ngini = 0.144\nsamples = 115\nvalue = [106, 9]\nclass = not aroused'), Text(189.35740072202168, 285.39, 'masculinity <= 0.699\ngini = 0.076\nsamples = 101\nvalue = [97, 4]\nclass = not aroused'), Text(187.34296028880868, 269.082, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'), Text(191.37184115523468, 269.082, 'familiarity <= 0.141\ngini = 0.058\nsamples = 100\nvalue = [97, 3]\nclass = not aroused'), Text(187.34296028880868, 252.774, 'familiarity <= 0.098\ngini = 0.5\nsamples = 2\nvalue = [1, 1]\nclass = not aroused'), Text(185.3285198555957, 236.46599999999998, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = not aroused'), Text(189.35740072202168, 236.46599999999998, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'), Text(195.40072202166067, 252.774, 'perceivability <= 0.895\ngini = 0.04\nsamples = 98\nvalue = [96, 2]\nclass = not aroused'), Text(193.38628158844767, 236.46599999999998, 'gini = 0.0\nsamples = 62\nvalue = [62, 0]\nclass = not aroused'), Text(197.41516245487367, 236.46599999999998, 'perceivability <= 0.896\ngini = 0.105\nsamples = 36\nvalue = [34, 2]\nclass = not aroused'), Text(195.40072202166067, 220.158, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'), Text(199.42960288808666, 220.158, 'valence <= 0.329\ngini = 0.056\nsamples = 35\nvalue = [34, 1]\nclass = not aroused'), Text(197.41516245487367, 203.85, 'valence <= 0.293\ngini = 0.444\nsamples = 3\nvalue = [2, 1]\nclass = not aroused'), Text(195.40072202166067, 187.542, 'gini = 0.0\nsamples = 2\nvalue = [2, 0]\nclass = not aroused'), Text(199.42960288808666, 187.542, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'), Text(201.44404332129966, 203.85, 'gini = 0.0\nsamples = 32\nvalue = [32, 0]\nclass = not aroused'), Text(207.48736462093865, 285.39, 'semsize <= 0.447\ngini = 0.459\nsamples = 14\nvalue = [9, 5]\nclass = not aroused'), Text(205.47292418772565, 
269.082, 'aoa <= 0.752\ngini = 0.298\nsamples = 11\nvalue = [9, 2]\nclass = not aroused'), Text(203.45848375451266, 252.774, 'dominance <= 0.622\ngini = 0.18\nsamples = 10\nvalue = [9, 1]\nclass = not aroused'), Text(201.44404332129966, 236.46599999999998, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'), Text(205.47292418772565, 236.46599999999998, 'gini = 0.0\nsamples = 9\nvalue = [9, 0]\nclass = not aroused'), Text(207.48736462093865, 252.774, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'), Text(209.50180505415165, 269.082, 'gini = 0.0\nsamples = 3\nvalue = [0, 3]\nclass = aroused'), Text(275.15013678925993, 350.62199999999996, 'masculinity <= 0.239\ngini = 0.276\nsamples = 757\nvalue = [632, 125]\nclass = not aroused'), Text(229.88227662454875, 334.31399999999996, 'masculinity <= 0.075\ngini = 0.298\nsamples = 11\nvalue = [2, 9]\nclass = aroused'), Text(227.86783619133575, 318.006, 'semsize <= 0.537\ngini = 0.444\nsamples = 3\nvalue = [2, 1]\nclass = not aroused'), Text(225.85339575812276, 301.698, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'), Text(229.88227662454875, 301.698, 'gini = 0.0\nsamples = 2\nvalue = [2, 0]\nclass = not aroused'), Text(231.89671705776175, 318.006, 'gini = 0.0\nsamples = 8\nvalue = [0, 8]\nclass = aroused'), Text(320.41799695397117, 334.31399999999996, 'dominance <= 0.603\ngini = 0.263\nsamples = 746\nvalue = [630, 116]\nclass = not aroused'), Text(277.73310582129966, 318.006, 'masculinity <= 0.66\ngini = 0.225\nsamples = 658\nvalue = [573, 85]\nclass = not aroused'), Text(233.91115749097474, 301.698, 'masculinity <= 0.405\ngini = 0.189\nsamples = 538\nvalue = [481, 57]\nclass = not aroused'), Text(215.54512635379064, 285.39, 'dominance <= 0.347\ngini = 0.38\nsamples = 51\nvalue = [38, 13]\nclass = not aroused'), Text(213.53068592057764, 269.082, 'gini = 0.0\nsamples = 12\nvalue = [12, 0]\nclass = not aroused'), Text(217.55956678700363, 269.082, 'valence <= 0.488\ngini = 0.444\nsamples = 39\nvalue 
= [26, 13]\nclass = not aroused'), Text(211.51624548736464, 252.774, 'perceivability <= 0.281\ngini = 0.375\nsamples = 8\nvalue = [2, 6]\nclass = aroused'), Text(209.50180505415165, 236.46599999999998, 'gini = 0.0\nsamples = 2\nvalue = [2, 0]\nclass = not aroused'), Text(213.53068592057764, 236.46599999999998, 'gini = 0.0\nsamples = 6\nvalue = [0, 6]\nclass = aroused'), Text(223.60288808664262, 252.774, 'masculinity <= 0.319\ngini = 0.35\nsamples = 31\nvalue = [24, 7]\nclass = not aroused'), Text(217.55956678700363, 236.46599999999998, 'masculinity <= 0.279\ngini = 0.5\nsamples = 8\nvalue = [4, 4]\nclass = not aroused'), Text(215.54512635379064, 220.158, 'gini = 0.0\nsamples = 3\nvalue = [3, 0]\nclass = not aroused'), Text(219.57400722021663, 220.158, 'familiarity <= 0.712\ngini = 0.32\nsamples = 5\nvalue = [1, 4]\nclass = aroused'), Text(217.55956678700363, 203.85, 'gini = 0.0\nsamples = 4\nvalue = [0, 4]\nclass = aroused'), Text(221.58844765342963, 203.85, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = not aroused'), Text(229.6462093862816, 236.46599999999998, 'valence <= 0.692\ngini = 0.227\nsamples = 23\nvalue = [20, 3]\nclass = not aroused'), Text(227.63176895306862, 220.158, 'familiarity <= 0.828\ngini = 0.165\nsamples = 22\nvalue = [20, 2]\nclass = not aroused'), Text(225.61732851985562, 203.85, 'gini = 0.0\nsamples = 15\nvalue = [15, 0]\nclass = not aroused'), Text(229.6462093862816, 203.85, 'masculinity <= 0.392\ngini = 0.408\nsamples = 7\nvalue = [5, 2]\nclass = not aroused'), Text(227.63176895306862, 187.542, 'gini = 0.0\nsamples = 4\nvalue = [4, 0]\nclass = not aroused'), Text(231.6606498194946, 187.542, 'aoa <= 0.113\ngini = 0.444\nsamples = 3\nvalue = [1, 2]\nclass = aroused'), Text(229.6462093862816, 171.23399999999998, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = not aroused'), Text(233.6750902527076, 171.23399999999998, 'gini = 0.0\nsamples = 2\nvalue = [0, 2]\nclass = aroused'), Text(231.6606498194946, 220.158, 'gini = 0.0\nsamples = 
1\nvalue = [0, 1]\nclass = aroused'), Text(252.27718862815888, 285.39, 'perceivability <= 0.143\ngini = 0.164\nsamples = 487\nvalue = [443, 44]\nclass = not aroused'), Text(242.7400722021661, 269.082, 'semsize <= 0.619\ngini = 0.436\nsamples = 28\nvalue = [19, 9]\nclass = not aroused'), Text(240.7256317689531, 252.774, 'perceivability <= 0.135\ngini = 0.365\nsamples = 25\nvalue = [19, 6]\nclass = not aroused'), Text(238.7111913357401, 236.46599999999998, 'semsize <= 0.568\ngini = 0.287\nsamples = 23\nvalue = [19, 4]\nclass = not aroused'), Text(236.6967509025271, 220.158, 'gini = 0.0\nsamples = 12\nvalue = [12, 0]\nclass = not aroused'), Text(240.7256317689531, 220.158, 'semsize <= 0.579\ngini = 0.463\nsamples = 11\nvalue = [7, 4]\nclass = not aroused'), Text(238.7111913357401, 203.85, 'aoa <= 0.882\ngini = 0.32\nsamples = 5\nvalue = [1, 4]\nclass = aroused'), Text(236.6967509025271, 187.542, 'gini = 0.0\nsamples = 4\nvalue = [0, 4]\nclass = aroused'), Text(240.7256317689531, 187.542, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = not aroused'), Text(242.7400722021661, 203.85, 'gini = 0.0\nsamples = 6\nvalue = [6, 0]\nclass = not aroused'), Text(242.7400722021661, 236.46599999999998, 'gini = 0.0\nsamples = 2\nvalue = [0, 2]\nclass = aroused'), Text(244.7545126353791, 252.774, 'gini = 0.0\nsamples = 3\nvalue = [0, 3]\nclass = aroused'), Text(261.81430505415165, 269.082, 'aoa <= 0.051\ngini = 0.141\nsamples = 459\nvalue = [424, 35]\nclass = not aroused'), Text(259.79986462093865, 252.774, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'), Text(263.82874548736464, 252.774, 'dominance <= 0.057\ngini = 0.137\nsamples = 458\nvalue = [424, 34]\nclass = not aroused'), Text(253.31588447653434, 236.46599999999998, 'semsize <= 0.579\ngini = 0.48\nsamples = 5\nvalue = [3, 2]\nclass = not aroused'), Text(251.30144404332134, 220.158, 'familiarity <= 0.566\ngini = 0.444\nsamples = 3\nvalue = [1, 2]\nclass = aroused'), Text(249.28700361010834, 203.85, 'gini = 
0.0\nsamples = 1\nvalue = [1, 0]\nclass = not aroused'), Text(253.31588447653434, 203.85, 'gini = 0.0\nsamples = 2\nvalue = [0, 2]\nclass = aroused'), Text(255.33032490974733, 220.158, 'gini = 0.0\nsamples = 2\nvalue = [2, 0]\nclass = not aroused'), Text(274.34160649819495, 236.46599999999998, 'aoa <= 0.418\ngini = 0.131\nsamples = 453\nvalue = [421, 32]\nclass = not aroused'), Text(259.3592057761733, 220.158, 'aoa <= 0.412\ngini = 0.22\nsamples = 119\nvalue = [104, 15]\nclass = not aroused'), Text(257.34476534296033, 203.85, 'web_corpus_freq <= 6.5\ngini = 0.198\nsamples = 117\nvalue = [104, 13]\nclass = not aroused'), Text(244.7545126353791, 187.542, 'perceivability <= 0.348\ngini = 0.363\nsamples = 42\nvalue = [32, 10]\nclass = not aroused'), Text(237.7039711191336, 171.23399999999998, 'familiarity <= 0.801\ngini = 0.496\nsamples = 11\nvalue = [5, 6]\nclass = aroused'), Text(235.6895306859206, 154.926, 'gini = 0.0\nsamples = 5\nvalue = [0, 5]\nclass = aroused'), Text(239.7184115523466, 154.926, 'perceivability <= 0.335\ngini = 0.278\nsamples = 6\nvalue = [5, 1]\nclass = not aroused'), Text(237.7039711191336, 138.618, 'gini = 0.0\nsamples = 5\nvalue = [5, 0]\nclass = not aroused'), Text(241.7328519855596, 138.618, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'), Text(251.80505415162457, 171.23399999999998, 'valence <= 0.604\ngini = 0.225\nsamples = 31\nvalue = [27, 4]\nclass = not aroused'), Text(247.77617328519858, 154.926, 'aoa <= 0.209\ngini = 0.087\nsamples = 22\nvalue = [21, 1]\nclass = not aroused'), Text(245.76173285198558, 138.618, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'), Text(249.79061371841158, 138.618, 'gini = 0.0\nsamples = 21\nvalue = [21, 0]\nclass = not aroused'), Text(255.83393501805057, 154.926, 'aoa <= 0.375\ngini = 0.444\nsamples = 9\nvalue = [6, 3]\nclass = not aroused'), Text(253.81949458483757, 138.618, 'valence <= 0.607\ngini = 0.245\nsamples = 7\nvalue = [6, 1]\nclass = not aroused'), 
Text(251.80505415162457, 122.31, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'), Text(255.83393501805057, 122.31, 'gini = 0.0\nsamples = 6\nvalue = [6, 0]\nclass = not aroused'), Text(257.84837545126356, 138.618, 'gini = 0.0\nsamples = 2\nvalue = [0, 2]\nclass = aroused'), Text(269.93501805054154, 187.542, 'valence <= 0.691\ngini = 0.077\nsamples = 75\nvalue = [72, 3]\nclass = not aroused'), Text(267.92057761732855, 171.23399999999998, 'familiarity <= 0.956\ngini = 0.053\nsamples = 74\nvalue = [72, 2]\nclass = not aroused'), Text(263.89169675090255, 154.926, 'length <= 3.5\ngini = 0.028\nsamples = 71\nvalue = [70, 1]\nclass = not aroused'), Text(261.87725631768956, 138.618, 'perceivability <= 0.244\ngini = 0.245\nsamples = 7\nvalue = [6, 1]\nclass = not aroused'), Text(259.86281588447656, 122.31, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'), Text(263.89169675090255, 122.31, 'gini = 0.0\nsamples = 6\nvalue = [6, 0]\nclass = not aroused'), Text(265.90613718411555, 138.618, 'gini = 0.0\nsamples = 64\nvalue = [64, 0]\nclass = not aroused'), Text(271.94945848375454, 154.926, 'aoa <= 0.219\ngini = 0.444\nsamples = 3\nvalue = [2, 1]\nclass = not aroused'), Text(269.93501805054154, 138.618, 'gini = 0.0\nsamples = 2\nvalue = [2, 0]\nclass = not aroused'), Text(273.96389891696754, 138.618, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'), Text(271.94945848375454, 171.23399999999998, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'), Text(261.3736462093863, 203.85, 'gini = 0.0\nsamples = 2\nvalue = [0, 2]\nclass = aroused'), Text(289.32400722021663, 220.158, 'semsize <= 0.633\ngini = 0.097\nsamples = 334\nvalue = [317, 17]\nclass = not aroused'), Text(285.29512635379064, 203.85, 'familiarity <= 0.456\ngini = 0.088\nsamples = 326\nvalue = [311, 15]\nclass = not aroused'), Text(283.28068592057764, 187.542, 'gini = 0.0\nsamples = 75\nvalue = [75, 0]\nclass = not aroused'), Text(287.30956678700363, 187.542, 'familiarity <= 
0.459\ngini = 0.112\nsamples = 251\nvalue = [236, 15]\nclass = not aroused'), Text(285.29512635379064, 171.23399999999998, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'), Text(289.32400722021663, 171.23399999999998, 'aoa <= 0.917\ngini = 0.106\nsamples = 250\nvalue = [236, 14]\nclass = not aroused'), Text(283.5324909747293, 154.926, 'perceivability <= 0.207\ngini = 0.099\nsamples = 248\nvalue = [235, 13]\nclass = not aroused'), Text(277.99277978339353, 138.618, 'perceivability <= 0.203\ngini = 0.287\nsamples = 23\nvalue = [19, 4]\nclass = not aroused'), Text(275.97833935018053, 122.31, 'length <= 4.5\ngini = 0.172\nsamples = 21\nvalue = [19, 2]\nclass = not aroused'), Text(273.96389891696754, 106.00200000000001, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'), Text(277.99277978339353, 106.00200000000001, 'dominance <= 0.188\ngini = 0.095\nsamples = 20\nvalue = [19, 1]\nclass = not aroused'), Text(275.97833935018053, 89.69400000000002, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'), Text(280.0072202166065, 89.69400000000002, 'gini = 0.0\nsamples = 19\nvalue = [19, 0]\nclass = not aroused'), Text(280.0072202166065, 122.31, 'gini = 0.0\nsamples = 2\nvalue = [0, 2]\nclass = aroused'), Text(289.07220216606504, 138.618, 'familiarity <= 0.471\ngini = 0.077\nsamples = 225\nvalue = [216, 9]\nclass = not aroused'), Text(284.0361010830325, 122.31, 'dominance <= 0.459\ngini = 0.375\nsamples = 4\nvalue = [3, 1]\nclass = not aroused'), Text(282.0216606498195, 106.00200000000001, 'gini = 0.0\nsamples = 3\nvalue = [3, 0]\nclass = not aroused'), Text(286.0505415162455, 106.00200000000001, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'), Text(294.1083032490975, 122.31, 'dominance <= 0.39\ngini = 0.07\nsamples = 221\nvalue = [213, 8]\nclass = not aroused'), Text(290.0794223826715, 106.00200000000001, 'dominance <= 0.387\ngini = 0.133\nsamples = 98\nvalue = [91, 7]\nclass = not aroused'), Text(288.0649819494585, 89.69400000000002, 
'length <= 5.5\ngini = 0.116\nsamples = 97\nvalue = [91, 6]\nclass = not aroused'), Text(284.0361010830325, 73.38599999999997, 'masculinity <= 0.629\ngini = 0.363\nsamples = 21\nvalue = [16, 5]\nclass = not aroused'), Text(282.0216606498195, 57.077999999999975, 'aoa <= 0.494\ngini = 0.266\nsamples = 19\nvalue = [16, 3]\nclass = not aroused'), Text(277.99277978339353, 40.76999999999998, 'valence <= 0.31\ngini = 0.444\nsamples = 3\nvalue = [1, 2]\nclass = aroused'), Text(275.97833935018053, 24.46199999999999, 'gini = 0.0\nsamples = 2\nvalue = [0, 2]\nclass = aroused'), Text(280.0072202166065, 24.46199999999999, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = not aroused'), Text(286.0505415162455, 40.76999999999998, 'semsize <= 0.513\ngini = 0.117\nsamples = 16\nvalue = [15, 1]\nclass = not aroused'), Text(284.0361010830325, 24.46199999999999, 'aoa <= 0.759\ngini = 0.5\nsamples = 2\nvalue = [1, 1]\nclass = not aroused'), Text(282.0216606498195, 8.153999999999996, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'), Text(286.0505415162455, 8.153999999999996, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = not aroused'), Text(288.0649819494585, 24.46199999999999, 'gini = 0.0\nsamples = 14\nvalue = [14, 0]\nclass = not aroused'), Text(286.0505415162455, 57.077999999999975, 'gini = 0.0\nsamples = 2\nvalue = [0, 2]\nclass = aroused'), Text(292.0938628158845, 73.38599999999997, 'valence <= 0.486\ngini = 0.026\nsamples = 76\nvalue = [75, 1]\nclass = not aroused'), Text(290.0794223826715, 57.077999999999975, 'gini = 0.0\nsamples = 67\nvalue = [67, 0]\nclass = not aroused'), Text(294.1083032490975, 57.077999999999975, 'perceivability <= 0.277\ngini = 0.198\nsamples = 9\nvalue = [8, 1]\nclass = not aroused'), Text(292.0938628158845, 40.76999999999998, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'), Text(296.1227436823105, 40.76999999999998, 'gini = 0.0\nsamples = 8\nvalue = [8, 0]\nclass = not aroused'), Text(292.0938628158845, 89.69400000000002, 'gini 
= 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'), Text(298.1371841155235, 106.00200000000001, 'familiarity <= 0.868\ngini = 0.016\nsamples = 123\nvalue = [122, 1]\nclass = not aroused'), Text(296.1227436823105, 89.69400000000002, 'gini = 0.0\nsamples = 119\nvalue = [119, 0]\nclass = not aroused'), Text(300.1516245487365, 89.69400000000002, 'dominance <= 0.464\ngini = 0.375\nsamples = 4\nvalue = [3, 1]\nclass = not aroused'), Text(298.1371841155235, 73.38599999999997, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'), Text(302.1660649819495, 73.38599999999997, 'gini = 0.0\nsamples = 3\nvalue = [3, 0]\nclass = not aroused'), Text(295.115523465704, 154.926, 'masculinity <= 0.534\ngini = 0.5\nsamples = 2\nvalue = [1, 1]\nclass = not aroused'), Text(293.10108303249103, 138.618, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = not aroused'), Text(297.129963898917, 138.618, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'), Text(293.3528880866426, 203.85, 'perceivability <= 0.48\ngini = 0.375\nsamples = 8\nvalue = [6, 2]\nclass = not aroused'), Text(291.3384476534296, 187.542, 'gini = 0.0\nsamples = 5\nvalue = [5, 0]\nclass = not aroused'), Text(295.3673285198556, 187.542, 'perceivability <= 0.674\ngini = 0.444\nsamples = 3\nvalue = [1, 2]\nclass = aroused'), Text(293.3528880866426, 171.23399999999998, 'gini = 0.0\nsamples = 2\nvalue = [0, 2]\nclass = aroused'), Text(297.3817689530686, 171.23399999999998, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = not aroused'), Text(321.5550541516246, 301.698, 'aoa <= 0.415\ngini = 0.358\nsamples = 120\nvalue = [92, 28]\nclass = not aroused'), Text(316.5189530685921, 285.39, 'perceivability <= 0.902\ngini = 0.499\nsamples = 19\nvalue = [9, 10]\nclass = aroused'), Text(312.4900722021661, 269.082, 'masculinity <= 0.708\ngini = 0.346\nsamples = 9\nvalue = [2, 7]\nclass = aroused'), Text(310.4756317689531, 252.774, 'masculinity <= 0.67\ngini = 0.444\nsamples = 3\nvalue = [2, 1]\nclass = not aroused'), 
Text(308.4611913357401, 236.46599999999998, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'), Text(312.4900722021661, 236.46599999999998, 'gini = 0.0\nsamples = 2\nvalue = [2, 0]\nclass = not aroused'), Text(314.5045126353791, 252.774, 'gini = 0.0\nsamples = 6\nvalue = [0, 6]\nclass = aroused'), Text(320.5478339350181, 269.082, 'masculinity <= 0.77\ngini = 0.42\nsamples = 10\nvalue = [7, 3]\nclass = not aroused'), Text(318.5333935018051, 252.774, 'perceivability <= 0.932\ngini = 0.48\nsamples = 5\nvalue = [2, 3]\nclass = aroused'), Text(316.5189530685921, 236.46599999999998, 'gini = 0.0\nsamples = 2\nvalue = [2, 0]\nclass = not aroused'), Text(320.5478339350181, 236.46599999999998, 'gini = 0.0\nsamples = 3\nvalue = [0, 3]\nclass = aroused'), Text(322.5622743682311, 252.774, 'gini = 0.0\nsamples = 5\nvalue = [5, 0]\nclass = not aroused'), Text(326.5911552346571, 285.39, 'perceivability <= 0.166\ngini = 0.293\nsamples = 101\nvalue = [83, 18]\nclass = not aroused'), Text(324.5767148014441, 269.082, 'gini = 0.0\nsamples = 2\nvalue = [0, 2]\nclass = aroused'), Text(328.6055956678701, 269.082, 'familiarity <= 0.904\ngini = 0.271\nsamples = 99\nvalue = [83, 16]\nclass = not aroused'), Text(326.5911552346571, 252.774, 'semsize <= 0.496\ngini = 0.259\nsamples = 98\nvalue = [83, 15]\nclass = not aroused'), Text(324.5767148014441, 236.46599999999998, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'), Text(328.6055956678701, 236.46599999999998, 'valence <= 0.489\ngini = 0.247\nsamples = 97\nvalue = [83, 14]\nclass = not aroused'), Text(321.8068592057762, 220.158, 'masculinity <= 0.877\ngini = 0.316\nsamples = 61\nvalue = [49, 12]\nclass = not aroused'), Text(319.7924187725632, 203.85, 'dominance <= 0.492\ngini = 0.282\nsamples = 59\nvalue = [49, 10]\nclass = not aroused'), Text(312.2382671480145, 187.542, 'perceivability <= 0.757\ngini = 0.159\nsamples = 46\nvalue = [42, 4]\nclass = not aroused'), Text(307.20216606498195, 171.23399999999998, 'aoa <= 
0.484\ngini = 0.089\nsamples = 43\nvalue = [41, 2]\nclass = not aroused'), Text(303.17328519855596, 154.926, 'dominance <= 0.432\ngini = 0.375\nsamples = 4\nvalue = [3, 1]\nclass = not aroused'), Text(301.158844765343, 138.618, 'gini = 0.0\nsamples = 3\nvalue = [3, 0]\nclass = not aroused'), Text(305.187725631769, 138.618, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'), Text(311.23104693140795, 154.926, 'valence <= 0.148\ngini = 0.05\nsamples = 39\nvalue = [38, 1]\nclass = not aroused'), Text(309.216606498195, 138.618, 'dominance <= 0.371\ngini = 0.18\nsamples = 10\nvalue = [9, 1]\nclass = not aroused'), Text(307.20216606498195, 122.31, 'gini = 0.0\nsamples = 8\nvalue = [8, 0]\nclass = not aroused'), Text(311.23104693140795, 122.31, 'familiarity <= 0.555\ngini = 0.5\nsamples = 2\nvalue = [1, 1]\nclass = not aroused'), Text(309.216606498195, 106.00200000000001, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = not aroused'), Text(313.245487364621, 106.00200000000001, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'), Text(313.245487364621, 138.618, 'gini = 0.0\nsamples = 29\nvalue = [29, 0]\nclass = not aroused'), Text(317.274368231047, 171.23399999999998, 'perceivability <= 0.933\ngini = 0.444\nsamples = 3\nvalue = [1, 2]\nclass = aroused'), Text(315.25992779783394, 154.926, 'gini = 0.0\nsamples = 2\nvalue = [0, 2]\nclass = aroused'), Text(319.28880866425993, 154.926, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = not aroused'), Text(327.3465703971119, 187.542, 'masculinity <= 0.788\ngini = 0.497\nsamples = 13\nvalue = [7, 6]\nclass = not aroused'), Text(325.332129963899, 171.23399999999998, 'dominance <= 0.58\ngini = 0.245\nsamples = 7\nvalue = [1, 6]\nclass = aroused'), Text(323.3176895306859, 154.926, 'gini = 0.0\nsamples = 6\nvalue = [0, 6]\nclass = aroused'), Text(327.3465703971119, 154.926, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = not aroused'), Text(329.361010830325, 171.23399999999998, 'gini = 0.0\nsamples = 6\nvalue = [6, 
0]\nclass = not aroused'), Text(323.8212996389892, 203.85, 'gini = 0.0\nsamples = 2\nvalue = [0, 2]\nclass = aroused'), Text(335.4043321299639, 220.158, 'semsize <= 0.611\ngini = 0.105\nsamples = 36\nvalue = [34, 2]\nclass = not aroused'), Text(333.38989169675096, 203.85, 'familiarity <= 0.775\ngini = 0.056\nsamples = 35\nvalue = [34, 1]\nclass = not aroused'), Text(331.3754512635379, 187.542, 'gini = 0.0\nsamples = 31\nvalue = [31, 0]\nclass = not aroused'), Text(335.4043321299639, 187.542, 'aoa <= 0.488\ngini = 0.375\nsamples = 4\nvalue = [3, 1]\nclass = not aroused'), Text(333.38989169675096, 171.23399999999998, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'), Text(337.41877256317696, 171.23399999999998, 'gini = 0.0\nsamples = 3\nvalue = [3, 0]\nclass = not aroused'), Text(337.41877256317696, 203.85, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'), Text(330.6200361010831, 252.774, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'), Text(363.1028880866426, 318.006, 'perceivability <= 0.686\ngini = 0.456\nsamples = 88\nvalue = [57, 31]\nclass = not aroused'), Text(350.5126353790614, 301.698, 'aoa <= 0.876\ngini = 0.399\nsamples = 69\nvalue = [50, 19]\nclass = not aroused'), Text(348.4981949458484, 285.39, 'dominance <= 0.622\ngini = 0.379\nsamples = 67\nvalue = [50, 17]\nclass = not aroused'), Text(341.44765342960295, 269.082, 'dominance <= 0.619\ngini = 0.492\nsamples = 16\nvalue = [9, 7]\nclass = not aroused'), Text(339.4332129963899, 252.774, 'aoa <= 0.402\ngini = 0.375\nsamples = 12\nvalue = [9, 3]\nclass = not aroused'), Text(337.41877256317696, 236.46599999999998, 'gini = 0.0\nsamples = 2\nvalue = [0, 2]\nclass = aroused'), Text(341.44765342960295, 236.46599999999998, 'semsize <= 0.513\ngini = 0.18\nsamples = 10\nvalue = [9, 1]\nclass = not aroused'), Text(339.4332129963899, 220.158, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'), Text(343.4620938628159, 220.158, 'gini = 0.0\nsamples = 9\nvalue = [9, 0]\nclass = 
not aroused'), Text(343.4620938628159, 252.774, 'gini = 0.0\nsamples = 4\nvalue = [0, 4]\nclass = aroused'), Text(355.54873646209387, 269.082, 'semsize <= 0.586\ngini = 0.315\nsamples = 51\nvalue = [41, 10]\nclass = not aroused'), Text(347.4909747292419, 252.774, 'masculinity <= 0.748\ngini = 0.069\nsamples = 28\nvalue = [27, 1]\nclass = not aroused'), Text(345.47653429602894, 236.46599999999998, 'gini = 0.0\nsamples = 26\nvalue = [26, 0]\nclass = not aroused'), Text(349.50541516245494, 236.46599999999998, 'perceivability <= 0.514\ngini = 0.5\nsamples = 2\nvalue = [1, 1]\nclass = not aroused'), Text(347.4909747292419, 220.158, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = not aroused'), Text(351.5198555956679, 220.158, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'), Text(363.60649819494586, 252.774, 'semsize <= 0.608\ngini = 0.476\nsamples = 23\nvalue = [14, 9]\nclass = not aroused'), Text(357.5631768953069, 236.46599999999998, 'masculinity <= 0.543\ngini = 0.463\nsamples = 11\nvalue = [4, 7]\nclass = aroused'), Text(355.54873646209387, 220.158, 'gini = 0.0\nsamples = 6\nvalue = [0, 6]\nclass = aroused'), Text(359.57761732851986, 220.158, 'valence <= 0.38\ngini = 0.32\nsamples = 5\nvalue = [4, 1]\nclass = not aroused'), Text(357.5631768953069, 203.85, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'), Text(361.5920577617329, 203.85, 'gini = 0.0\nsamples = 4\nvalue = [4, 0]\nclass = not aroused'), Text(369.6498194945849, 236.46599999999998, 'masculinity <= 0.741\ngini = 0.278\nsamples = 12\nvalue = [10, 2]\nclass = not aroused'), Text(367.63537906137185, 220.158, 'valence <= 0.695\ngini = 0.165\nsamples = 11\nvalue = [10, 1]\nclass = not aroused'), Text(365.6209386281589, 203.85, 'gini = 0.0\nsamples = 10\nvalue = [10, 0]\nclass = not aroused'), Text(369.6498194945849, 203.85, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'), Text(371.66425992779784, 220.158, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'), 
Text(352.5270758122744, 285.39, 'gini = 0.0\nsamples = 2\nvalue = [0, 2]\nclass = aroused'), Text(375.69314079422384, 301.698, 'web_corpus_freq <= 6.5\ngini = 0.465\nsamples = 19\nvalue = [7, 12]\nclass = aroused'), Text(371.66425992779784, 285.39, 'masculinity <= 0.679\ngini = 0.18\nsamples = 10\nvalue = [1, 9]\nclass = aroused'), Text(369.6498194945849, 269.082, 'valence <= 0.617\ngini = 0.5\nsamples = 2\nvalue = [1, 1]\nclass = not aroused'), Text(367.63537906137185, 252.774, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = not aroused'), Text(371.66425992779784, 252.774, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'), Text(373.6787003610109, 269.082, 'gini = 0.0\nsamples = 8\nvalue = [0, 8]\nclass = aroused'), Text(379.72202166064983, 285.39, 'aoa <= 0.329\ngini = 0.444\nsamples = 9\nvalue = [6, 3]\nclass = not aroused'), Text(377.7075812274369, 269.082, 'semsize <= 0.597\ngini = 0.375\nsamples = 4\nvalue = [1, 3]\nclass = aroused'), Text(375.69314079422384, 252.774, 'gini = 0.0\nsamples = 3\nvalue = [0, 3]\nclass = aroused'), Text(379.72202166064983, 252.774, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = not aroused'), Text(381.7364620938629, 269.082, 'gini = 0.0\nsamples = 5\nvalue = [5, 0]\nclass = not aroused'), Text(525.1000958935018, 366.93, 'dominance <= 0.608\ngini = 0.477\nsamples = 530\nvalue = [322, 208]\nclass = not aroused'), Text(482.12798962093865, 350.62199999999996, 'valence <= 0.402\ngini = 0.452\nsamples = 460\nvalue = [301, 159]\nclass = not aroused'), Text(440.1552346570397, 334.31399999999996, 'dominance <= 0.33\ngini = 0.499\nsamples = 244\nvalue = [128, 116]\nclass = not aroused'), Text(408.9314079422383, 318.006, 'semsize <= 0.824\ngini = 0.456\nsamples = 131\nvalue = [85, 46]\nclass = not aroused'), Text(406.9169675090253, 301.698, 'aoa <= 0.549\ngini = 0.408\nsamples = 119\nvalue = [85, 34]\nclass = not aroused'), Text(393.82310469314086, 285.39, 'perceivability <= 0.474\ngini = 0.5\nsamples = 41\nvalue = [20, 
21]\nclass = aroused'), Text(385.7653429602889, 269.082, 'masculinity <= 0.447\ngini = 0.403\nsamples = 25\nvalue = [7, 18]\nclass = aroused'), Text(383.7509025270758, 252.774, 'gini = 0.0\nsamples = 3\nvalue = [3, 0]\nclass = not aroused'), Text(387.7797833935018, 252.774, 'familiarity <= 0.82\ngini = 0.298\nsamples = 22\nvalue = [4, 18]\nclass = aroused'), Text(383.7509025270758, 236.46599999999998, 'aoa <= 0.309\ngini = 0.111\nsamples = 17\nvalue = [1, 16]\nclass = aroused'), Text(381.7364620938629, 220.158, 'aoa <= 0.256\ngini = 0.444\nsamples = 3\nvalue = [1, 2]\nclass = aroused'), Text(379.72202166064983, 203.85, 'gini = 0.0\nsamples = 2\nvalue = [0, 2]\nclass = aroused'), Text(383.7509025270758, 203.85, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = not aroused'), Text(385.7653429602889, 220.158, 'gini = 0.0\nsamples = 14\nvalue = [0, 14]\nclass = aroused'), Text(391.8086642599278, 236.46599999999998, 'perceivability <= 0.409\ngini = 0.48\nsamples = 5\nvalue = [3, 2]\nclass = not aroused'), Text(389.79422382671487, 220.158, 'gini = 0.0\nsamples = 3\nvalue = [3, 0]\nclass = not aroused'), Text(393.82310469314086, 220.158, 'gini = 0.0\nsamples = 2\nvalue = [0, 2]\nclass = aroused'), Text(401.88086642599285, 269.082, 'dominance <= 0.28\ngini = 0.305\nsamples = 16\nvalue = [13, 3]\nclass = not aroused'), Text(397.85198555956686, 252.774, 'perceivability <= 0.86\ngini = 0.142\nsamples = 13\nvalue = [12, 1]\nclass = not aroused'), Text(395.8375451263538, 236.46599999999998, 'gini = 0.0\nsamples = 11\nvalue = [11, 0]\nclass = not aroused'), Text(399.8664259927798, 236.46599999999998, 'masculinity <= 0.656\ngini = 0.5\nsamples = 2\nvalue = [1, 1]\nclass = not aroused'), Text(397.85198555956686, 220.158, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = not aroused'), Text(401.88086642599285, 220.158, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'), Text(405.90974729241884, 252.774, 'length <= 6.0\ngini = 0.444\nsamples = 3\nvalue = [1, 2]\nclass = 
aroused'), Text(403.8953068592058, 236.46599999999998, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = not aroused'), Text(407.9241877256318, 236.46599999999998, 'gini = 0.0\nsamples = 2\nvalue = [0, 2]\nclass = aroused'), Text(420.01083032490976, 285.39, 'web_corpus_freq <= 5.5\ngini = 0.278\nsamples = 78\nvalue = [65, 13]\nclass = not aroused'), Text(417.9963898916968, 269.082, 'gini = 0.0\nsamples = 21\nvalue = [21, 0]\nclass = not aroused'), Text(422.0252707581228, 269.082, 'valence <= 0.163\ngini = 0.352\nsamples = 57\nvalue = [44, 13]\nclass = not aroused'), Text(413.96750902527083, 252.774, 'perceivability <= 0.158\ngini = 0.208\nsamples = 34\nvalue = [30, 4]\nclass = not aroused'), Text(411.9530685920578, 236.46599999999998, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'), Text(415.98194945848377, 236.46599999999998, 'valence <= 0.068\ngini = 0.165\nsamples = 33\nvalue = [30, 3]\nclass = not aroused'), Text(411.9530685920578, 220.158, 'semsize <= 0.757\ngini = 0.48\nsamples = 5\nvalue = [3, 2]\nclass = not aroused'), Text(409.93862815884484, 203.85, 'perceivability <= 0.553\ngini = 0.444\nsamples = 3\nvalue = [1, 2]\nclass = aroused'), Text(407.9241877256318, 187.542, 'gini = 0.0\nsamples = 2\nvalue = [0, 2]\nclass = aroused'), Text(411.9530685920578, 187.542, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = not aroused'), Text(413.96750902527083, 203.85, 'gini = 0.0\nsamples = 2\nvalue = [2, 0]\nclass = not aroused'), Text(420.01083032490976, 220.158, 'semsize <= 0.652\ngini = 0.069\nsamples = 28\nvalue = [27, 1]\nclass = not aroused'), Text(417.9963898916968, 203.85, 'length <= 6.5\ngini = 0.444\nsamples = 3\nvalue = [2, 1]\nclass = not aroused'), Text(415.98194945848377, 187.542, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'), Text(420.01083032490976, 187.542, 'gini = 0.0\nsamples = 2\nvalue = [2, 0]\nclass = not aroused'), Text(422.0252707581228, 203.85, 'gini = 0.0\nsamples = 25\nvalue = [25, 0]\nclass = not aroused'), 
Text(430.0830324909748, 252.774, 'dominance <= 0.176\ngini = 0.476\nsamples = 23\nvalue = [14, 9]\nclass = not aroused'), Text(426.0541516245488, 236.46599999999998, 'perceivability <= 0.242\ngini = 0.375\nsamples = 8\nvalue = [2, 6]\nclass = aroused'), Text(424.03971119133575, 220.158, 'gini = 0.0\nsamples = 2\nvalue = [2, 0]\nclass = not aroused'), Text(428.06859205776175, 220.158, 'gini = 0.0\nsamples = 6\nvalue = [0, 6]\nclass = aroused'), Text(434.1119133574008, 236.46599999999998, 'familiarity <= 0.367\ngini = 0.32\nsamples = 15\nvalue = [12, 3]\nclass = not aroused'), Text(432.09747292418774, 220.158, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'), Text(436.12635379061373, 220.158, 'valence <= 0.325\ngini = 0.245\nsamples = 14\nvalue = [12, 2]\nclass = not aroused'), Text(432.09747292418774, 203.85, 'masculinity <= 0.538\ngini = 0.153\nsamples = 12\nvalue = [11, 1]\nclass = not aroused'), Text(430.0830324909748, 187.542, 'familiarity <= 0.804\ngini = 0.444\nsamples = 3\nvalue = [2, 1]\nclass = not aroused'), Text(428.06859205776175, 171.23399999999998, 'gini = 0.0\nsamples = 2\nvalue = [2, 0]\nclass = not aroused'), Text(432.09747292418774, 171.23399999999998, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'), Text(434.1119133574008, 187.542, 'gini = 0.0\nsamples = 9\nvalue = [9, 0]\nclass = not aroused'), Text(440.1552346570397, 203.85, 'valence <= 0.338\ngini = 0.5\nsamples = 2\nvalue = [1, 1]\nclass = not aroused'), Text(438.1407942238268, 187.542, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'), Text(442.1696750902528, 187.542, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = not aroused'), Text(410.9458483754513, 301.698, 'gini = 0.0\nsamples = 12\nvalue = [0, 12]\nclass = aroused'), Text(471.3790613718412, 318.006, 'masculinity <= 0.755\ngini = 0.471\nsamples = 113\nvalue = [43, 70]\nclass = aroused'), Text(464.3285198555957, 301.698, 'semsize <= 0.737\ngini = 0.497\nsamples = 80\nvalue = [37, 43]\nclass = aroused'), 
Text(456.2707581227437, 285.39, 'familiarity <= 0.579\ngini = 0.495\nsamples = 58\nvalue = [32, 26]\nclass = not aroused'), Text(448.2129963898917, 269.082, 'familiarity <= 0.476\ngini = 0.465\nsamples = 19\nvalue = [7, 12]\nclass = aroused'), Text(444.1841155234657, 252.774, 'dominance <= 0.401\ngini = 0.444\nsamples = 9\nvalue = [6, 3]\nclass = not aroused'), Text(442.1696750902528, 236.46599999999998, 'semsize <= 0.715\ngini = 0.375\nsamples = 4\nvalue = [1, 3]\nclass = aroused'), Text(440.1552346570397, 220.158, 'gini = 0.0\nsamples = 3\nvalue = [0, 3]\nclass = aroused'), Text(444.1841155234657, 220.158, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = not aroused'), Text(446.1985559566788, 236.46599999999998, 'gini = 0.0\nsamples = 5\nvalue = [5, 0]\nclass = not aroused'), Text(452.2418772563177, 252.774, 'dominance <= 0.418\ngini = 0.18\nsamples = 10\nvalue = [1, 9]\nclass = aroused'), Text(450.22743682310477, 236.46599999999998, 'semsize <= 0.677\ngini = 0.444\nsamples = 3\nvalue = [1, 2]\nclass = aroused'), Text(448.2129963898917, 220.158, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = not aroused'), Text(452.2418772563177, 220.158, 'gini = 0.0\nsamples = 2\nvalue = [0, 2]\nclass = aroused'), Text(454.25631768953076, 236.46599999999998, 'gini = 0.0\nsamples = 7\nvalue = [0, 7]\nclass = aroused'), Text(464.3285198555957, 269.082, 'valence <= 0.207\ngini = 0.46\nsamples = 39\nvalue = [25, 14]\nclass = not aroused'), Text(460.2996389891697, 252.774, 'aoa <= 0.241\ngini = 0.245\nsamples = 21\nvalue = [18, 3]\nclass = not aroused'), Text(458.28519855595675, 236.46599999999998, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'), Text(462.31407942238275, 236.46599999999998, 'perceivability <= 0.769\ngini = 0.18\nsamples = 20\nvalue = [18, 2]\nclass = not aroused'), Text(460.2996389891697, 220.158, 'familiarity <= 0.873\ngini = 0.1\nsamples = 19\nvalue = [18, 1]\nclass = not aroused'), Text(458.28519855595675, 203.85, 'gini = 0.0\nsamples = 17\nvalue = 
[17, 0]\nclass = not aroused'), Text(462.31407942238275, 203.85, 'perceivability <= 0.584\ngini = 0.5\nsamples = 2\nvalue = [1, 1]\nclass = not aroused'), Text(460.2996389891697, 187.542, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'), Text(464.3285198555957, 187.542, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = not aroused'), Text(464.3285198555957, 220.158, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'), Text(468.3574007220217, 252.774, 'masculinity <= 0.61\ngini = 0.475\nsamples = 18\nvalue = [7, 11]\nclass = aroused'), Text(466.34296028880874, 236.46599999999998, 'gini = 0.0\nsamples = 7\nvalue = [0, 7]\nclass = aroused'), Text(470.37184115523473, 236.46599999999998, 'dominance <= 0.393\ngini = 0.463\nsamples = 11\nvalue = [7, 4]\nclass = not aroused'), Text(468.3574007220217, 220.158, 'gini = 0.0\nsamples = 3\nvalue = [0, 3]\nclass = aroused'), Text(472.3862815884477, 220.158, 'semsize <= 0.714\ngini = 0.219\nsamples = 8\nvalue = [7, 1]\nclass = not aroused'), Text(470.37184115523473, 203.85, 'gini = 0.0\nsamples = 7\nvalue = [7, 0]\nclass = not aroused'), Text(474.4007220216607, 203.85, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'), Text(472.3862815884477, 285.39, 'aoa <= 0.589\ngini = 0.351\nsamples = 22\nvalue = [5, 17]\nclass = aroused'), Text(470.37184115523473, 269.082, 'gini = 0.0\nsamples = 9\nvalue = [0, 9]\nclass = aroused'), Text(474.4007220216607, 269.082, 'familiarity <= 0.524\ngini = 0.473\nsamples = 13\nvalue = [5, 8]\nclass = aroused'), Text(472.3862815884477, 252.774, 'gini = 0.0\nsamples = 6\nvalue = [0, 6]\nclass = aroused'), Text(476.41516245487367, 252.774, 'familiarity <= 0.63\ngini = 0.408\nsamples = 7\nvalue = [5, 2]\nclass = not aroused'), Text(474.4007220216607, 236.46599999999998, 'gini = 0.0\nsamples = 4\nvalue = [4, 0]\nclass = not aroused'), Text(478.4296028880867, 236.46599999999998, 'masculinity <= 0.541\ngini = 0.444\nsamples = 3\nvalue = [1, 2]\nclass = aroused'), 
Text(476.41516245487367, 220.158, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = not aroused'), Text(480.44404332129966, 220.158, 'gini = 0.0\nsamples = 2\nvalue = [0, 2]\nclass = aroused'), Text(478.4296028880867, 301.698, 'length <= 4.5\ngini = 0.298\nsamples = 33\nvalue = [6, 27]\nclass = aroused'), Text(476.41516245487367, 285.39, 'gini = 0.0\nsamples = 3\nvalue = [3, 0]\nclass = not aroused'), Text(480.44404332129966, 285.39, 'masculinity <= 0.822\ngini = 0.18\nsamples = 30\nvalue = [3, 27]\nclass = aroused'), Text(478.4296028880867, 269.082, 'gini = 0.0\nsamples = 24\nvalue = [0, 24]\nclass = aroused'), Text(482.4584837545127, 269.082, 'perceivability <= 0.626\ngini = 0.5\nsamples = 6\nvalue = [3, 3]\nclass = not aroused'), Text(480.44404332129966, 252.774, 'gini = 0.0\nsamples = 2\nvalue = [2, 0]\nclass = not aroused'), Text(484.47292418772565, 252.774, 'perceivability <= 0.751\ngini = 0.375\nsamples = 4\nvalue = [1, 3]\nclass = aroused'), Text(482.4584837545127, 236.46599999999998, 'gini = 0.0\nsamples = 3\nvalue = [0, 3]\nclass = aroused'), Text(486.4873646209387, 236.46599999999998, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = not aroused'), Text(524.1007445848376, 334.31399999999996, 'valence <= 0.645\ngini = 0.319\nsamples = 216\nvalue = [173, 43]\nclass = not aroused'), Text(507.3242328519856, 318.006, 'semsize <= 0.887\ngini = 0.246\nsamples = 181\nvalue = [155, 26]\nclass = not aroused'), Text(499.707129963899, 301.698, 'masculinity <= 0.345\ngini = 0.204\nsamples = 173\nvalue = [153, 20]\nclass = not aroused'), Text(490.5162454873647, 285.39, 'semsize <= 0.647\ngini = 0.32\nsamples = 5\nvalue = [1, 4]\nclass = aroused'), Text(488.50180505415165, 269.082, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = not aroused'), Text(492.53068592057764, 269.082, 'gini = 0.0\nsamples = 4\nvalue = [0, 4]\nclass = aroused'), Text(508.89801444043326, 285.39, 'perceivability <= 0.225\ngini = 0.172\nsamples = 168\nvalue = [152, 16]\nclass = not aroused'), 
Text(496.55956678700363, 269.082, 'semsize <= 0.729\ngini = 0.444\nsamples = 18\nvalue = [12, 6]\nclass = not aroused'), Text(492.53068592057764, 252.774, 'dominance <= 0.59\ngini = 0.18\nsamples = 10\nvalue = [9, 1]\nclass = not aroused'), Text(490.5162454873647, 236.46599999999998, 'gini = 0.0\nsamples = 9\nvalue = [9, 0]\nclass = not aroused'), Text(494.5451263537907, 236.46599999999998, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'), Text(500.5884476534296, 252.774, 'aoa <= 0.291\ngini = 0.469\nsamples = 8\nvalue = [3, 5]\nclass = aroused'), Text(498.5740072202167, 236.46599999999998, 'gini = 0.0\nsamples = 3\nvalue = [3, 0]\nclass = not aroused'), Text(502.6028880866427, 236.46599999999998, 'gini = 0.0\nsamples = 5\nvalue = [0, 5]\nclass = aroused'), Text(521.2364620938629, 269.082, 'semsize <= 0.783\ngini = 0.124\nsamples = 150\nvalue = [140, 10]\nclass = not aroused'), Text(513.6823104693142, 252.774, 'semsize <= 0.677\ngini = 0.082\nsamples = 116\nvalue = [111, 5]\nclass = not aroused'), Text(506.6317689530687, 236.46599999999998, 'dominance <= 0.504\ngini = 0.18\nsamples = 40\nvalue = [36, 4]\nclass = not aroused'), Text(502.6028880866427, 220.158, 'perceivability <= 0.937\ngini = 0.064\nsamples = 30\nvalue = [29, 1]\nclass = not aroused'), Text(500.5884476534296, 203.85, 'gini = 0.0\nsamples = 27\nvalue = [27, 0]\nclass = not aroused'), Text(504.6173285198556, 203.85, 'valence <= 0.501\ngini = 0.444\nsamples = 3\nvalue = [2, 1]\nclass = not aroused'), Text(502.6028880866427, 187.542, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'), Text(506.6317689530687, 187.542, 'gini = 0.0\nsamples = 2\nvalue = [2, 0]\nclass = not aroused'), Text(510.66064981949467, 220.158, 'length <= 6.5\ngini = 0.42\nsamples = 10\nvalue = [7, 3]\nclass = not aroused'), Text(508.6462093862816, 203.85, 'gini = 0.0\nsamples = 6\nvalue = [6, 0]\nclass = not aroused'), Text(512.6750902527076, 203.85, 'valence <= 0.628\ngini = 0.375\nsamples = 4\nvalue = [1, 
3]\nclass = aroused'), Text(510.66064981949467, 187.542, 'gini = 0.0\nsamples = 3\nvalue = [0, 3]\nclass = aroused'), Text(514.6895306859207, 187.542, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = not aroused'), Text(520.7328519855596, 236.46599999999998, 'valence <= 0.468\ngini = 0.026\nsamples = 76\nvalue = [75, 1]\nclass = not aroused'), Text(518.7184115523467, 220.158, 'valence <= 0.46\ngini = 0.219\nsamples = 8\nvalue = [7, 1]\nclass = not aroused'), Text(516.7039711191336, 203.85, 'gini = 0.0\nsamples = 7\nvalue = [7, 0]\nclass = not aroused'), Text(520.7328519855596, 203.85, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'), Text(522.7472924187726, 220.158, 'gini = 0.0\nsamples = 68\nvalue = [68, 0]\nclass = not aroused'), Text(528.7906137184116, 252.774, 'semsize <= 0.785\ngini = 0.251\nsamples = 34\nvalue = [29, 5]\nclass = not aroused'), Text(526.7761732851986, 236.46599999999998, 'gini = 0.0\nsamples = 2\nvalue = [0, 2]\nclass = aroused'), Text(530.8050541516246, 236.46599999999998, 'length <= 4.5\ngini = 0.17\nsamples = 32\nvalue = [29, 3]\nclass = not aroused'), Text(526.7761732851986, 220.158, 'semsize <= 0.819\ngini = 0.5\nsamples = 4\nvalue = [2, 2]\nclass = not aroused'), Text(524.7617328519856, 203.85, 'gini = 0.0\nsamples = 2\nvalue = [2, 0]\nclass = not aroused'), Text(528.7906137184116, 203.85, 'gini = 0.0\nsamples = 2\nvalue = [0, 2]\nclass = aroused'), Text(534.8339350180506, 220.158, 'length <= 9.5\ngini = 0.069\nsamples = 28\nvalue = [27, 1]\nclass = not aroused'), Text(532.8194945848376, 203.85, 'gini = 0.0\nsamples = 24\nvalue = [24, 0]\nclass = not aroused'), Text(536.8483754512636, 203.85, 'semsize <= 0.827\ngini = 0.375\nsamples = 4\nvalue = [3, 1]\nclass = not aroused'), Text(534.8339350180506, 187.542, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'), Text(538.8628158844766, 187.542, 'gini = 0.0\nsamples = 3\nvalue = [3, 0]\nclass = not aroused'), Text(514.9413357400723, 301.698, 'dominance <= 0.452\ngini = 
0.375\nsamples = 8\nvalue = [2, 6]\nclass = aroused'), Text(512.9268953068593, 285.39, 'gini = 0.0\nsamples = 2\nvalue = [2, 0]\nclass = not aroused'), Text(516.9557761732852, 285.39, 'gini = 0.0\nsamples = 6\nvalue = [0, 6]\nclass = aroused'), Text(540.8772563176896, 318.006, 'familiarity <= 0.873\ngini = 0.5\nsamples = 35\nvalue = [18, 17]\nclass = not aroused'), Text(538.8628158844766, 301.698, 'dominance <= 0.441\ngini = 0.471\nsamples = 29\nvalue = [18, 11]\nclass = not aroused'), Text(532.8194945848376, 285.39, 'length <= 7.5\ngini = 0.346\nsamples = 9\nvalue = [2, 7]\nclass = aroused'), Text(530.8050541516246, 269.082, 'gini = 0.0\nsamples = 6\nvalue = [0, 6]\nclass = aroused'), Text(534.8339350180506, 269.082, 'length <= 9.5\ngini = 0.444\nsamples = 3\nvalue = [2, 1]\nclass = not aroused'), Text(532.8194945848376, 252.774, 'gini = 0.0\nsamples = 2\nvalue = [2, 0]\nclass = not aroused'), Text(536.8483754512636, 252.774, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'), Text(544.9061371841156, 285.39, 'masculinity <= 0.708\ngini = 0.32\nsamples = 20\nvalue = [16, 4]\nclass = not aroused'), Text(542.8916967509026, 269.082, 'masculinity <= 0.408\ngini = 0.198\nsamples = 18\nvalue = [16, 2]\nclass = not aroused'), Text(540.8772563176896, 252.774, 'valence <= 0.662\ngini = 0.5\nsamples = 4\nvalue = [2, 2]\nclass = not aroused'), Text(538.8628158844766, 236.46599999999998, 'gini = 0.0\nsamples = 2\nvalue = [0, 2]\nclass = aroused'), Text(542.8916967509026, 236.46599999999998, 'gini = 0.0\nsamples = 2\nvalue = [2, 0]\nclass = not aroused'), Text(544.9061371841156, 252.774, 'gini = 0.0\nsamples = 14\nvalue = [14, 0]\nclass = not aroused'), Text(546.9205776173286, 269.082, 'gini = 0.0\nsamples = 2\nvalue = [0, 2]\nclass = aroused'), Text(542.8916967509026, 301.698, 'gini = 0.0\nsamples = 6\nvalue = [0, 6]\nclass = aroused'), Text(568.072202166065, 350.62199999999996, 'valence <= 0.687\ngini = 0.42\nsamples = 70\nvalue = [21, 49]\nclass = aroused'), 
Text(561.0216606498195, 334.31399999999996, 'perceivability <= 0.248\ngini = 0.379\nsamples = 63\nvalue = [16, 47]\nclass = aroused'), Text(556.9927797833935, 318.006, 'aoa <= 0.774\ngini = 0.496\nsamples = 11\nvalue = [6, 5]\nclass = not aroused'), Text(554.9783393501806, 301.698, 'semsize <= 0.677\ngini = 0.408\nsamples = 7\nvalue = [2, 5]\nclass = aroused'), Text(552.9638989169675, 285.39, 'masculinity <= 0.467\ngini = 0.444\nsamples = 3\nvalue = [2, 1]\nclass = not aroused'), Text(550.9494584837546, 269.082, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'), Text(554.9783393501806, 269.082, 'gini = 0.0\nsamples = 2\nvalue = [2, 0]\nclass = not aroused'), Text(556.9927797833935, 285.39, 'gini = 0.0\nsamples = 4\nvalue = [0, 4]\nclass = aroused'), Text(559.0072202166066, 301.698, 'gini = 0.0\nsamples = 4\nvalue = [4, 0]\nclass = not aroused'), Text(565.0505415162455, 318.006, 'valence <= 0.601\ngini = 0.311\nsamples = 52\nvalue = [10, 42]\nclass = aroused'), Text(563.0361010830326, 301.698, 'gini = 0.0\nsamples = 19\nvalue = [0, 19]\nclass = aroused'), Text(567.0649819494586, 301.698, 'valence <= 0.612\ngini = 0.422\nsamples = 33\nvalue = [10, 23]\nclass = aroused'), Text(565.0505415162455, 285.39, 'gini = 0.0\nsamples = 2\nvalue = [2, 0]\nclass = not aroused'), Text(569.0794223826715, 285.39, 'length <= 9.5\ngini = 0.383\nsamples = 31\nvalue = [8, 23]\nclass = aroused'), Text(564.043321299639, 269.082, 'masculinity <= 0.773\ngini = 0.269\nsamples = 25\nvalue = [4, 21]\nclass = aroused'), Text(560.014440433213, 252.774, 'semsize <= 0.658\ngini = 0.165\nsamples = 22\nvalue = [2, 20]\nclass = aroused'), Text(558.0, 236.46599999999998, 'familiarity <= 0.692\ngini = 0.444\nsamples = 6\nvalue = [2, 4]\nclass = aroused'), Text(555.9855595667871, 220.158, 'gini = 0.0\nsamples = 3\nvalue = [0, 3]\nclass = aroused'), Text(560.014440433213, 220.158, 'web_corpus_freq <= 7.0\ngini = 0.444\nsamples = 3\nvalue = [2, 1]\nclass = not aroused'), Text(558.0, 203.85, 
'gini = 0.0\nsamples = 2\nvalue = [2, 0]\nclass = not aroused'), Text(562.0288808664261, 203.85, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'), Text(562.0288808664261, 236.46599999999998, 'gini = 0.0\nsamples = 16\nvalue = [0, 16]\nclass = aroused'), Text(568.072202166065, 252.774, 'perceivability <= 0.869\ngini = 0.444\nsamples = 3\nvalue = [2, 1]\nclass = not aroused'), Text(566.057761732852, 236.46599999999998, 'gini = 0.0\nsamples = 2\nvalue = [2, 0]\nclass = not aroused'), Text(570.0866425992781, 236.46599999999998, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'), Text(574.115523465704, 269.082, 'masculinity <= 0.666\ngini = 0.444\nsamples = 6\nvalue = [4, 2]\nclass = not aroused'), Text(572.101083032491, 252.774, 'gini = 0.0\nsamples = 4\nvalue = [4, 0]\nclass = not aroused'), Text(576.129963898917, 252.774, 'gini = 0.0\nsamples = 2\nvalue = [0, 2]\nclass = aroused'), Text(575.1227436823106, 334.31399999999996, 'masculinity <= 0.469\ngini = 0.408\nsamples = 7\nvalue = [5, 2]\nclass = not aroused'), Text(573.1083032490975, 318.006, 'dominance <= 0.653\ngini = 0.444\nsamples = 3\nvalue = [1, 2]\nclass = aroused'), Text(571.0938628158846, 301.698, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = not aroused'), Text(575.1227436823106, 301.698, 'gini = 0.0\nsamples = 2\nvalue = [0, 2]\nclass = aroused'), Text(577.1371841155235, 318.006, 'gini = 0.0\nsamples = 4\nvalue = [4, 0]\nclass = not aroused'), Text(714.209626015343, 383.238, 'dominance <= 0.585\ngini = 0.448\nsamples = 682\nvalue = [231, 451]\nclass = aroused'), Text(656.2590534747293, 366.93, 'semsize <= 0.725\ngini = 0.494\nsamples = 269\nvalue = [149, 120]\nclass = not aroused'), Text(612.5000564079423, 350.62199999999996, 'masculinity <= 0.189\ngini = 0.471\nsamples = 219\nvalue = [136, 83]\nclass = not aroused'), Text(585.1949458483755, 334.31399999999996, 'masculinity <= 0.108\ngini = 0.482\nsamples = 32\nvalue = [13, 19]\nclass = aroused'), Text(581.1660649819495, 318.006, 
'web_corpus_freq <= 6.5\ngini = 0.457\nsamples = 17\nvalue = [11, 6]\nclass = not aroused'), Text(579.1516245487365, 301.698, 'gini = 0.0\nsamples = 6\nvalue = [6, 0]\nclass = not aroused'), Text(583.1805054151625, 301.698, 'aoa <= 0.239\ngini = 0.496\nsamples = 11\nvalue = [5, 6]\nclass = aroused'), Text(581.1660649819495, 285.39, 'familiarity <= 0.772\ngini = 0.408\nsamples = 7\nvalue = [5, 2]\nclass = not aroused'), Text(579.1516245487365, 269.082, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'), Text(583.1805054151625, 269.082, 'masculinity <= 0.026\ngini = 0.278\nsamples = 6\nvalue = [5, 1]\nclass = not aroused'), Text(581.1660649819495, 252.774, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'), Text(585.1949458483755, 252.774, 'gini = 0.0\nsamples = 5\nvalue = [5, 0]\nclass = not aroused'), Text(585.1949458483755, 285.39, 'gini = 0.0\nsamples = 4\nvalue = [0, 4]\nclass = aroused'), Text(589.2238267148015, 318.006, 'familiarity <= 0.619\ngini = 0.231\nsamples = 15\nvalue = [2, 13]\nclass = aroused'), Text(587.2093862815885, 301.698, 'gini = 0.0\nsamples = 2\nvalue = [2, 0]\nclass = not aroused'), Text(591.2382671480145, 301.698, 'gini = 0.0\nsamples = 13\nvalue = [0, 13]\nclass = aroused'), Text(639.8051669675091, 334.31399999999996, 'aoa <= 0.453\ngini = 0.45\nsamples = 187\nvalue = [123, 64]\nclass = not aroused'), Text(613.0823555956679, 318.006, 'perceivability <= 0.428\ngini = 0.487\nsamples = 105\nvalue = [61, 44]\nclass = not aroused'), Text(595.2671480144405, 301.698, 'masculinity <= 0.405\ngini = 0.337\nsamples = 28\nvalue = [22, 6]\nclass = not aroused'), Text(593.2527075812275, 285.39, 'gini = 0.0\nsamples = 16\nvalue = [16, 0]\nclass = not aroused'), Text(597.2815884476535, 285.39, 'valence <= 0.712\ngini = 0.5\nsamples = 12\nvalue = [6, 6]\nclass = not aroused'), Text(595.2671480144405, 269.082, 'gini = 0.0\nsamples = 3\nvalue = [3, 0]\nclass = not aroused'), Text(599.2960288808665, 269.082, 'masculinity <= 0.468\ngini = 
0.444\nsamples = 9\nvalue = [3, 6]\nclass = aroused'), Text(597.2815884476535, 252.774, 'length <= 3.5\ngini = 0.245\nsamples = 7\nvalue = [1, 6]\nclass = aroused'), Text(595.2671480144405, 236.46599999999998, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = not aroused'), Text(599.2960288808665, 236.46599999999998, 'gini = 0.0\nsamples = 6\nvalue = [0, 6]\nclass = aroused'), Text(601.3104693140795, 252.774, 'gini = 0.0\nsamples = 2\nvalue = [2, 0]\nclass = not aroused'), Text(630.8975631768953, 301.698, 'valence <= 0.782\ngini = 0.5\nsamples = 77\nvalue = [39, 38]\nclass = not aroused'), Text(616.1669675090253, 285.39, 'perceivability <= 0.581\ngini = 0.481\nsamples = 57\nvalue = [34, 23]\nclass = not aroused'), Text(607.3537906137185, 269.082, 'semsize <= 0.493\ngini = 0.346\nsamples = 9\nvalue = [2, 7]\nclass = aroused'), Text(605.3393501805054, 252.774, 'gini = 0.0\nsamples = 6\nvalue = [0, 6]\nclass = aroused'), Text(609.3682310469314, 252.774, 'semsize <= 0.55\ngini = 0.444\nsamples = 3\nvalue = [2, 1]\nclass = not aroused'), Text(607.3537906137185, 236.46599999999998, 'gini = 0.0\nsamples = 2\nvalue = [2, 0]\nclass = not aroused'), Text(611.3826714801445, 236.46599999999998, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'), Text(624.9801444043322, 269.082, 'masculinity <= 0.323\ngini = 0.444\nsamples = 48\nvalue = [32, 16]\nclass = not aroused'), Text(617.4259927797834, 252.774, 'familiarity <= 0.914\ngini = 0.208\nsamples = 17\nvalue = [15, 2]\nclass = not aroused'), Text(615.4115523465705, 236.46599999999998, 'polysemy <= 0.5\ngini = 0.117\nsamples = 16\nvalue = [15, 1]\nclass = not aroused'), Text(613.3971119133574, 220.158, 'gini = 0.0\nsamples = 14\nvalue = [14, 0]\nclass = not aroused'), Text(617.4259927797834, 220.158, 'familiarity <= 0.863\ngini = 0.5\nsamples = 2\nvalue = [1, 1]\nclass = not aroused'), Text(615.4115523465705, 203.85, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'), Text(619.4404332129965, 203.85, 'gini = 
0.0\nsamples = 1\nvalue = [1, 0]\nclass = not aroused'), Text(619.4404332129965, 236.46599999999998, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'), Text(632.5342960288809, 252.774, 'semsize <= 0.329\ngini = 0.495\nsamples = 31\nvalue = [17, 14]\nclass = not aroused'), Text(630.5198555956679, 236.46599999999998, 'gini = 0.0\nsamples = 4\nvalue = [0, 4]\nclass = aroused'), Text(634.548736462094, 236.46599999999998, 'semsize <= 0.578\ngini = 0.466\nsamples = 27\nvalue = [17, 10]\nclass = not aroused'), Text(627.4981949458485, 220.158, 'dominance <= 0.578\ngini = 0.32\nsamples = 15\nvalue = [12, 3]\nclass = not aroused'), Text(623.4693140794225, 203.85, 'perceivability <= 0.969\ngini = 0.153\nsamples = 12\nvalue = [11, 1]\nclass = not aroused'), Text(621.4548736462094, 187.542, 'gini = 0.0\nsamples = 11\nvalue = [11, 0]\nclass = not aroused'), Text(625.4837545126354, 187.542, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'), Text(631.5270758122745, 203.85, 'semsize <= 0.422\ngini = 0.444\nsamples = 3\nvalue = [1, 2]\nclass = aroused'), Text(629.5126353790614, 187.542, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = not aroused'), Text(633.5415162454874, 187.542, 'gini = 0.0\nsamples = 2\nvalue = [0, 2]\nclass = aroused'), Text(641.5992779783394, 220.158, 'aoa <= 0.312\ngini = 0.486\nsamples = 12\nvalue = [5, 7]\nclass = aroused'), Text(639.5848375451264, 203.85, 'perceivability <= 0.914\ngini = 0.408\nsamples = 7\nvalue = [5, 2]\nclass = not aroused'), Text(637.5703971119134, 187.542, 'gini = 0.0\nsamples = 5\nvalue = [5, 0]\nclass = not aroused'), Text(641.5992779783394, 187.542, 'gini = 0.0\nsamples = 2\nvalue = [0, 2]\nclass = aroused'), Text(643.6137184115524, 203.85, 'gini = 0.0\nsamples = 5\nvalue = [0, 5]\nclass = aroused'), Text(645.6281588447654, 285.39, 'semsize <= 0.444\ngini = 0.375\nsamples = 20\nvalue = [5, 15]\nclass = aroused'), Text(643.6137184115524, 269.082, 'gini = 0.0\nsamples = 7\nvalue = [0, 7]\nclass = aroused'), 
Text(647.6425992779784, 269.082, 'familiarity <= 0.885\ngini = 0.473\nsamples = 13\nvalue = [5, 8]\nclass = aroused'), Text(645.6281588447654, 252.774, 'perceivability <= 0.494\ngini = 0.32\nsamples = 10\nvalue = [2, 8]\nclass = aroused'), Text(643.6137184115524, 236.46599999999998, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = not aroused'), Text(647.6425992779784, 236.46599999999998, 'dominance <= 0.57\ngini = 0.198\nsamples = 9\nvalue = [1, 8]\nclass = aroused'), Text(645.6281588447654, 220.158, 'gini = 0.0\nsamples = 8\nvalue = [0, 8]\nclass = aroused'), Text(649.6570397111914, 220.158, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = not aroused'), Text(649.6570397111914, 252.774, 'gini = 0.0\nsamples = 3\nvalue = [3, 0]\nclass = not aroused'), Text(666.5279783393503, 318.006, 'valence <= 0.776\ngini = 0.369\nsamples = 82\nvalue = [62, 20]\nclass = not aroused'), Text(657.7148014440434, 301.698, 'masculinity <= 0.219\ngini = 0.136\nsamples = 41\nvalue = [38, 3]\nclass = not aroused'), Text(653.6859205776174, 285.39, 'web_corpus_freq <= 5.5\ngini = 0.5\nsamples = 2\nvalue = [1, 1]\nclass = not aroused'), Text(651.6714801444044, 269.082, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = not aroused'), Text(655.7003610108304, 269.082, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'), Text(661.7436823104694, 285.39, 'length <= 5.5\ngini = 0.097\nsamples = 39\nvalue = [37, 2]\nclass = not aroused'), Text(659.7292418772564, 269.082, 'valence <= 0.721\ngini = 0.26\nsamples = 13\nvalue = [11, 2]\nclass = not aroused'), Text(657.7148014440434, 252.774, 'gini = 0.0\nsamples = 8\nvalue = [8, 0]\nclass = not aroused'), Text(661.7436823104694, 252.774, 'length <= 4.5\ngini = 0.48\nsamples = 5\nvalue = [3, 2]\nclass = not aroused'), Text(659.7292418772564, 236.46599999999998, 'gini = 0.0\nsamples = 3\nvalue = [3, 0]\nclass = not aroused'), Text(663.7581227436824, 236.46599999999998, 'gini = 0.0\nsamples = 2\nvalue = [0, 2]\nclass = aroused'), 
Text(663.7581227436824, 269.082, 'gini = 0.0\nsamples = 26\nvalue = [26, 0]\nclass = not aroused'), Text(675.3411552346571, 301.698, 'perceivability <= 0.222\ngini = 0.485\nsamples = 41\nvalue = [24, 17]\nclass = not aroused'), Text(669.8014440433213, 285.39, 'familiarity <= 0.774\ngini = 0.278\nsamples = 6\nvalue = [1, 5]\nclass = aroused'), Text(667.7870036101084, 269.082, 'gini = 0.0\nsamples = 5\nvalue = [0, 5]\nclass = aroused'), Text(671.8158844765344, 269.082, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = not aroused'), Text(680.8808664259928, 285.39, 'perceivability <= 0.296\ngini = 0.451\nsamples = 35\nvalue = [23, 12]\nclass = not aroused'), Text(675.8447653429604, 269.082, 'semsize <= 0.686\ngini = 0.165\nsamples = 11\nvalue = [10, 1]\nclass = not aroused'), Text(673.8303249097473, 252.774, 'gini = 0.0\nsamples = 10\nvalue = [10, 0]\nclass = not aroused'), Text(677.8592057761733, 252.774, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'), Text(685.9169675090253, 269.082, 'dominance <= 0.467\ngini = 0.497\nsamples = 24\nvalue = [13, 11]\nclass = not aroused'), Text(681.8880866425993, 252.774, 'familiarity <= 0.787\ngini = 0.219\nsamples = 8\nvalue = [7, 1]\nclass = not aroused'), Text(679.8736462093864, 236.46599999999998, 'gini = 0.0\nsamples = 7\nvalue = [7, 0]\nclass = not aroused'), Text(683.9025270758124, 236.46599999999998, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'), Text(689.9458483754513, 252.774, 'semsize <= 0.676\ngini = 0.469\nsamples = 16\nvalue = [6, 10]\nclass = aroused'), Text(687.9314079422384, 236.46599999999998, 'aoa <= 0.491\ngini = 0.355\nsamples = 13\nvalue = [3, 10]\nclass = aroused'), Text(683.9025270758124, 220.158, 'web_corpus_freq <= 6.5\ngini = 0.444\nsamples = 3\nvalue = [2, 1]\nclass = not aroused'), Text(681.8880866425993, 203.85, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'), Text(685.9169675090253, 203.85, 'gini = 0.0\nsamples = 2\nvalue = [2, 0]\nclass = not aroused'), 
Text(691.9602888086644, 220.158, 'length <= 10.5\ngini = 0.18\nsamples = 10\nvalue = [1, 9]\nclass = aroused'), Text(689.9458483754513, 203.85, 'gini = 0.0\nsamples = 9\nvalue = [0, 9]\nclass = aroused'), Text(693.9747292418773, 203.85, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = not aroused'), Text(691.9602888086644, 236.46599999999998, 'gini = 0.0\nsamples = 3\nvalue = [3, 0]\nclass = not aroused'), Text(700.0180505415163, 350.62199999999996, 'valence <= 0.84\ngini = 0.385\nsamples = 50\nvalue = [13, 37]\nclass = aroused'), Text(698.0036101083033, 334.31399999999996, 'semsize <= 0.868\ngini = 0.478\nsamples = 33\nvalue = [13, 20]\nclass = aroused'), Text(695.9891696750904, 318.006, 'dominance <= 0.522\ngini = 0.497\nsamples = 24\nvalue = [13, 11]\nclass = not aroused'), Text(689.9458483754513, 301.698, 'familiarity <= 0.817\ngini = 0.355\nsamples = 13\nvalue = [10, 3]\nclass = not aroused'), Text(687.9314079422384, 285.39, 'gini = 0.0\nsamples = 8\nvalue = [8, 0]\nclass = not aroused'), Text(691.9602888086644, 285.39, 'dominance <= 0.5\ngini = 0.48\nsamples = 5\nvalue = [2, 3]\nclass = aroused'), Text(689.9458483754513, 269.082, 'gini = 0.0\nsamples = 3\nvalue = [0, 3]\nclass = aroused'), Text(693.9747292418773, 269.082, 'gini = 0.0\nsamples = 2\nvalue = [2, 0]\nclass = not aroused'), Text(702.0324909747293, 301.698, 'dominance <= 0.573\ngini = 0.397\nsamples = 11\nvalue = [3, 8]\nclass = aroused'), Text(700.0180505415163, 285.39, 'familiarity <= 0.934\ngini = 0.198\nsamples = 9\nvalue = [1, 8]\nclass = aroused'), Text(698.0036101083033, 269.082, 'gini = 0.0\nsamples = 8\nvalue = [0, 8]\nclass = aroused'), Text(702.0324909747293, 269.082, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = not aroused'), Text(704.0469314079423, 285.39, 'gini = 0.0\nsamples = 2\nvalue = [2, 0]\nclass = not aroused'), Text(700.0180505415163, 318.006, 'gini = 0.0\nsamples = 9\nvalue = [0, 9]\nclass = aroused'), Text(702.0324909747293, 334.31399999999996, 'gini = 0.0\nsamples = 
17\nvalue = [0, 17]\nclass = aroused'), Text(772.1601985559568, 366.93, 'valence <= 0.801\ngini = 0.318\nsamples = 413\nvalue = [82, 331]\nclass = aroused'), Text(734.1376353790614, 350.62199999999996, 'perceivability <= 0.326\ngini = 0.464\nsamples = 150\nvalue = [55, 95]\nclass = aroused'), Text(713.6155234657041, 334.31399999999996, 'perceivability <= 0.146\ngini = 0.469\nsamples = 56\nvalue = [35, 21]\nclass = not aroused'), Text(708.0758122743683, 318.006, 'masculinity <= 0.39\ngini = 0.245\nsamples = 7\nvalue = [1, 6]\nclass = aroused'), Text(706.0613718411553, 301.698, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = not aroused'), Text(710.0902527075813, 301.698, 'gini = 0.0\nsamples = 6\nvalue = [0, 6]\nclass = aroused'), Text(719.1552346570397, 318.006, 'masculinity <= 0.531\ngini = 0.425\nsamples = 49\nvalue = [34, 15]\nclass = not aroused'), Text(714.1191335740073, 301.698, 'familiarity <= 0.571\ngini = 0.298\nsamples = 33\nvalue = [27, 6]\nclass = not aroused'), Text(712.1046931407943, 285.39, 'gini = 0.0\nsamples = 3\nvalue = [0, 3]\nclass = aroused'), Text(716.1335740072203, 285.39, 'polysemy <= 0.5\ngini = 0.18\nsamples = 30\nvalue = [27, 3]\nclass = not aroused'), Text(714.1191335740073, 269.082, 'familiarity <= 0.902\ngini = 0.128\nsamples = 29\nvalue = [27, 2]\nclass = not aroused'), Text(710.0902527075813, 252.774, 'semsize <= 0.518\ngini = 0.071\nsamples = 27\nvalue = [26, 1]\nclass = not aroused'), Text(708.0758122743683, 236.46599999999998, 'perceivability <= 0.223\ngini = 0.32\nsamples = 5\nvalue = [4, 1]\nclass = not aroused'), Text(706.0613718411553, 220.158, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'), Text(710.0902527075813, 220.158, 'gini = 0.0\nsamples = 4\nvalue = [4, 0]\nclass = not aroused'), Text(712.1046931407943, 236.46599999999998, 'gini = 0.0\nsamples = 22\nvalue = [22, 0]\nclass = not aroused'), Text(718.1480144404333, 252.774, 'perceivability <= 0.202\ngini = 0.5\nsamples = 2\nvalue = [1, 1]\nclass = not 
aroused'), Text(716.1335740072203, 236.46599999999998, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = not aroused'), Text(720.1624548736463, 236.46599999999998, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'), Text(718.1480144404333, 269.082, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'), Text(724.1913357400723, 301.698, 'semsize <= 0.54\ngini = 0.492\nsamples = 16\nvalue = [7, 9]\nclass = aroused'), Text(722.1768953068593, 285.39, 'gini = 0.0\nsamples = 3\nvalue = [3, 0]\nclass = not aroused'), Text(726.2057761732852, 285.39, 'perceivability <= 0.306\ngini = 0.426\nsamples = 13\nvalue = [4, 9]\nclass = aroused'), Text(724.1913357400723, 269.082, 'valence <= 0.704\ngini = 0.298\nsamples = 11\nvalue = [2, 9]\nclass = aroused'), Text(722.1768953068593, 252.774, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = not aroused'), Text(726.2057761732852, 252.774, 'perceivability <= 0.187\ngini = 0.18\nsamples = 10\nvalue = [1, 9]\nclass = aroused'), Text(724.1913357400723, 236.46599999999998, 'masculinity <= 0.606\ngini = 0.5\nsamples = 2\nvalue = [1, 1]\nclass = not aroused'), Text(722.1768953068593, 220.158, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'), Text(726.2057761732852, 220.158, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = not aroused'), Text(728.2202166064983, 236.46599999999998, 'gini = 0.0\nsamples = 8\nvalue = [0, 8]\nclass = aroused'), Text(728.2202166064983, 269.082, 'gini = 0.0\nsamples = 2\nvalue = [2, 0]\nclass = not aroused'), Text(754.6597472924188, 334.31399999999996, 'masculinity <= 0.049\ngini = 0.335\nsamples = 94\nvalue = [20, 74]\nclass = aroused'), Text(752.6453068592059, 318.006, 'gini = 0.0\nsamples = 2\nvalue = [2, 0]\nclass = not aroused'), Text(756.6741877256319, 318.006, 'length <= 8.5\ngini = 0.315\nsamples = 92\nvalue = [18, 74]\nclass = aroused'), Text(740.8104693140795, 301.698, 'perceivability <= 0.416\ngini = 0.26\nsamples = 78\nvalue = [12, 66]\nclass = aroused'), 
Text(734.2635379061372, 285.39, 'familiarity <= 0.852\ngini = 0.435\nsamples = 25\nvalue = [8, 17]\nclass = aroused'), Text(732.2490974729243, 269.082, 'masculinity <= 0.525\ngini = 0.308\nsamples = 21\nvalue = [4, 17]\nclass = aroused'), Text(730.2346570397112, 252.774, 'gini = 0.0\nsamples = 11\nvalue = [0, 11]\nclass = aroused'), Text(734.2635379061372, 252.774, 'masculinity <= 0.536\ngini = 0.48\nsamples = 10\nvalue = [4, 6]\nclass = aroused'), Text(732.2490974729243, 236.46599999999998, 'gini = 0.0\nsamples = 2\nvalue = [2, 0]\nclass = not aroused'), Text(736.2779783393503, 236.46599999999998, 'length <= 4.5\ngini = 0.375\nsamples = 8\nvalue = [2, 6]\nclass = aroused'), Text(734.2635379061372, 220.158, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = not aroused'), Text(738.2924187725632, 220.158, 'familiarity <= 0.301\ngini = 0.245\nsamples = 7\nvalue = [1, 6]\nclass = aroused'), Text(736.2779783393503, 203.85, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = not aroused'), Text(740.3068592057763, 203.85, 'gini = 0.0\nsamples = 6\nvalue = [0, 6]\nclass = aroused'), Text(736.2779783393503, 269.082, 'gini = 0.0\nsamples = 4\nvalue = [4, 0]\nclass = not aroused'), Text(747.3574007220218, 285.39, 'masculinity <= 0.251\ngini = 0.14\nsamples = 53\nvalue = [4, 49]\nclass = aroused'), Text(740.3068592057763, 269.082, 'masculinity <= 0.19\ngini = 0.444\nsamples = 6\nvalue = [2, 4]\nclass = aroused'), Text(738.2924187725632, 252.774, 'gini = 0.0\nsamples = 3\nvalue = [0, 3]\nclass = aroused'), Text(742.3212996389892, 252.774, 'web_corpus_freq <= 6.5\ngini = 0.444\nsamples = 3\nvalue = [2, 1]\nclass = not aroused'), Text(740.3068592057763, 236.46599999999998, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'), Text(744.3357400722023, 236.46599999999998, 'gini = 0.0\nsamples = 2\nvalue = [2, 0]\nclass = not aroused'), Text(754.4079422382672, 269.082, 'perceivability <= 0.958\ngini = 0.081\nsamples = 47\nvalue = [2, 45]\nclass = aroused'), 
Text(750.3790613718412, 252.774, 'masculinity <= 0.383\ngini = 0.043\nsamples = 45\nvalue = [1, 44]\nclass = aroused'), Text(748.3646209386283, 236.46599999999998, 'masculinity <= 0.379\ngini = 0.18\nsamples = 10\nvalue = [1, 9]\nclass = aroused'), Text(746.3501805054152, 220.158, 'gini = 0.0\nsamples = 8\nvalue = [0, 8]\nclass = aroused'), Text(750.3790613718412, 220.158, 'length <= 5.0\ngini = 0.5\nsamples = 2\nvalue = [1, 1]\nclass = not aroused'), Text(748.3646209386283, 203.85, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'), Text(752.3935018050543, 203.85, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = not aroused'), Text(752.3935018050543, 236.46599999999998, 'gini = 0.0\nsamples = 35\nvalue = [0, 35]\nclass = aroused'), Text(758.4368231046932, 252.774, 'polysemy <= 0.5\ngini = 0.5\nsamples = 2\nvalue = [1, 1]\nclass = not aroused'), Text(756.4223826714802, 236.46599999999998, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'), Text(760.4512635379062, 236.46599999999998, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = not aroused'), Text(772.5379061371842, 301.698, 'perceivability <= 0.793\ngini = 0.49\nsamples = 14\nvalue = [6, 8]\nclass = aroused'), Text(770.5234657039712, 285.39, 'masculinity <= 0.593\ngini = 0.444\nsamples = 12\nvalue = [4, 8]\nclass = aroused'), Text(768.5090252707582, 269.082, 'familiarity <= 0.769\ngini = 0.5\nsamples = 8\nvalue = [4, 4]\nclass = not aroused'), Text(766.4945848375452, 252.774, 'dominance <= 0.602\ngini = 0.32\nsamples = 5\nvalue = [4, 1]\nclass = not aroused'), Text(764.4801444043322, 236.46599999999998, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'), Text(768.5090252707582, 236.46599999999998, 'gini = 0.0\nsamples = 4\nvalue = [4, 0]\nclass = not aroused'), Text(770.5234657039712, 252.774, 'gini = 0.0\nsamples = 3\nvalue = [0, 3]\nclass = aroused'), Text(772.5379061371842, 269.082, 'gini = 0.0\nsamples = 4\nvalue = [0, 4]\nclass = aroused'), Text(774.5523465703972, 285.39, 'gini 
= 0.0\nsamples = 2\nvalue = [2, 0]\nclass = not aroused'), Text(810.1827617328521, 350.62199999999996, 'dominance <= 0.663\ngini = 0.184\nsamples = 263\nvalue = [27, 236]\nclass = aroused'), Text(789.408844765343, 334.31399999999996, 'perceivability <= 0.25\ngini = 0.332\nsamples = 95\nvalue = [20, 75]\nclass = aroused'), Text(782.6101083032491, 318.006, 'valence <= 0.896\ngini = 0.5\nsamples = 20\nvalue = [10, 10]\nclass = not aroused'), Text(780.5956678700362, 301.698, 'perceivability <= 0.212\ngini = 0.408\nsamples = 14\nvalue = [10, 4]\nclass = not aroused'), Text(778.5812274368232, 285.39, 'masculinity <= 0.369\ngini = 0.5\nsamples = 8\nvalue = [4, 4]\nclass = not aroused'), Text(776.5667870036102, 269.082, 'valence <= 0.857\ngini = 0.32\nsamples = 5\nvalue = [4, 1]\nclass = not aroused'), Text(774.5523465703972, 252.774, 'gini = 0.0\nsamples = 4\nvalue = [4, 0]\nclass = not aroused'), Text(778.5812274368232, 252.774, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'), Text(780.5956678700362, 269.082, 'gini = 0.0\nsamples = 3\nvalue = [0, 3]\nclass = aroused'), Text(782.6101083032491, 285.39, 'gini = 0.0\nsamples = 6\nvalue = [6, 0]\nclass = not aroused'), Text(784.6245487364622, 301.698, 'gini = 0.0\nsamples = 6\nvalue = [0, 6]\nclass = aroused'), Text(796.2075812274369, 318.006, 'perceivability <= 0.451\ngini = 0.231\nsamples = 75\nvalue = [10, 65]\nclass = aroused'), Text(794.193140794224, 301.698, 'aoa <= 0.463\ngini = 0.34\nsamples = 46\nvalue = [10, 36]\nclass = aroused'), Text(786.6389891696751, 285.39, 'aoa <= 0.397\ngini = 0.497\nsamples = 13\nvalue = [6, 7]\nclass = aroused'), Text(784.6245487364622, 269.082, 'perceivability <= 0.349\ngini = 0.42\nsamples = 10\nvalue = [3, 7]\nclass = aroused'), Text(782.6101083032491, 252.774, 'gini = 0.0\nsamples = 5\nvalue = [0, 5]\nclass = aroused'), Text(786.6389891696751, 252.774, 'aoa <= 0.273\ngini = 0.48\nsamples = 5\nvalue = [3, 2]\nclass = not aroused'), Text(784.6245487364622, 236.46599999999998, 
'gini = 0.0\nsamples = 3\nvalue = [3, 0]\nclass = not aroused'), Text(788.6534296028882, 236.46599999999998, 'gini = 0.0\nsamples = 2\nvalue = [0, 2]\nclass = aroused'), Text(788.6534296028882, 269.082, 'gini = 0.0\nsamples = 3\nvalue = [3, 0]\nclass = not aroused'), Text(801.7472924187726, 285.39, 'length <= 12.5\ngini = 0.213\nsamples = 33\nvalue = [4, 29]\nclass = aroused'), Text(799.7328519855596, 269.082, 'masculinity <= 0.168\ngini = 0.17\nsamples = 32\nvalue = [3, 29]\nclass = aroused'), Text(794.6967509025271, 252.774, 'familiarity <= 0.622\ngini = 0.5\nsamples = 2\nvalue = [1, 1]\nclass = not aroused'), Text(792.6823104693142, 236.46599999999998, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = not aroused'), Text(796.7111913357402, 236.46599999999998, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'), Text(804.7689530685922, 252.774, 'dominance <= 0.596\ngini = 0.124\nsamples = 30\nvalue = [2, 28]\nclass = aroused'), Text(800.7400722021662, 236.46599999999998, 'perceivability <= 0.349\ngini = 0.5\nsamples = 2\nvalue = [1, 1]\nclass = not aroused'), Text(798.7256317689531, 220.158, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = not aroused'), Text(802.7545126353791, 220.158, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'), Text(808.7978339350182, 236.46599999999998, 'length <= 9.0\ngini = 0.069\nsamples = 28\nvalue = [1, 27]\nclass = aroused'), Text(806.7833935018051, 220.158, 'gini = 0.0\nsamples = 23\nvalue = [0, 23]\nclass = aroused'), Text(810.8122743682311, 220.158, 'valence <= 0.907\ngini = 0.32\nsamples = 5\nvalue = [1, 4]\nclass = aroused'), Text(808.7978339350182, 203.85, 'gini = 0.0\nsamples = 4\nvalue = [0, 4]\nclass = aroused'), Text(812.8267148014442, 203.85, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = not aroused'), Text(803.7617328519857, 269.082, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = not aroused'), Text(798.22202166065, 301.698, 'gini = 0.0\nsamples = 29\nvalue = [0, 29]\nclass = aroused'), 
Text(830.9566787003611, 334.31399999999996, 'familiarity <= 0.404\ngini = 0.08\nsamples = 168\nvalue = [7, 161]\nclass = aroused'), Text(828.9422382671481, 318.006, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = not aroused'), Text(832.9711191335741, 318.006, 'semsize <= 0.649\ngini = 0.069\nsamples = 167\nvalue = [6, 161]\nclass = aroused'), Text(830.9566787003611, 301.698, 'semsize <= 0.649\ngini = 0.219\nsamples = 48\nvalue = [6, 42]\nclass = aroused'), Text(828.9422382671481, 285.39, 'dominance <= 0.812\ngini = 0.19\nsamples = 47\nvalue = [5, 42]\nclass = aroused'), Text(826.9277978339351, 269.082, 'familiarity <= 0.901\ngini = 0.159\nsamples = 46\nvalue = [4, 42]\nclass = aroused'), Text(822.8989169675091, 252.774, 'perceivability <= 0.311\ngini = 0.097\nsamples = 39\nvalue = [2, 37]\nclass = aroused'), Text(820.8844765342961, 236.46599999999998, 'perceivability <= 0.303\ngini = 0.245\nsamples = 14\nvalue = [2, 12]\nclass = aroused'), Text(818.8700361010831, 220.158, 'length <= 8.0\ngini = 0.142\nsamples = 13\nvalue = [1, 12]\nclass = aroused'), Text(816.8555956678701, 203.85, 'gini = 0.0\nsamples = 11\nvalue = [0, 11]\nclass = aroused'), Text(820.8844765342961, 203.85, 'aoa <= 0.65\ngini = 0.5\nsamples = 2\nvalue = [1, 1]\nclass = not aroused'), Text(818.8700361010831, 187.542, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = aroused'), Text(822.8989169675091, 187.542, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = not aroused'), Text(822.8989169675091, 220.158, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = not aroused'), Text(824.9133574007221, 236.46599999999998, 'gini = 0.0\nsamples = 25\nvalue = [0, 25]\nclass = aroused'), Text(830.9566787003611, 252.774, 'aoa <= 0.283\ngini = 0.408\nsamples = 7\nvalue = [2, 5]\nclass = aroused'), Text(828.9422382671481, 236.46599999999998, 'gini = 0.0\nsamples = 5\nvalue = [0, 5]\nclass = aroused'), Text(832.9711191335741, 236.46599999999998, 'gini = 0.0\nsamples = 2\nvalue = [2, 0]\nclass = not aroused'), 
Text(830.9566787003611, 269.082, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = not aroused'), Text(832.9711191335741, 285.39, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = not aroused'), Text(834.9855595667871, 301.698, 'gini = 0.0\nsamples = 119\nvalue = [0, 119]\nclass = aroused')]
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix  # kept: still referenced by later cells
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.model_selection import cross_val_score

# Confusion matrix of the unpruned tree on the test set.
# plot_confusion_matrix is deprecated in sklearn 1.0 and removed in 1.2
# (see the FutureWarning below); ConfusionMatrixDisplay.from_estimator is
# the supported replacement.
ConfusionMatrixDisplay.from_estimator(clf_dt, X_test, y_test,
                                      display_labels=["not aroused", "aroused"])
/usr/local/lib/python3.7/dist-packages/sklearn/utils/deprecation.py:87: FutureWarning: Function plot_confusion_matrix is deprecated; Function `plot_confusion_matrix` is deprecated in 1.0 and will be removed in 1.2. Use one of the class methods: ConfusionMatrixDisplay.from_predictions or ConfusionMatrixDisplay.from_estimator. warnings.warn(msg, category=FutureWarning)
<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x7f87310de210>
# Evaluate the unpruned tree on the held-out test set.
# BUG FIX: the original first computed predictions on X_train and then
# immediately overwrote them with the X_test predictions — the train
# prediction was dead code and has been removed.
y_pred = clf_dt.predict(X_test)
print('Accuracy %s' % accuracy_score(y_test, y_pred))
print('F1-score %s' % f1_score(y_test, y_pred, average=None))
print(classification_report(y_test, y_pred))

# ROC curve from the predicted probability of the positive class (column 1).
y_score = clf_dt.predict_proba(X_test)
fpr, tpr, th = roc_curve(y_test, y_score[:, 1])
roc_auc = auc(fpr, tpr)
print(roc_auc)

plt.figure(figsize=(8, 5))
plt.plot(fpr, tpr)
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate', fontsize=20)
plt.ylabel('True Positive Rate', fontsize=20)
plt.tick_params(axis='both', which='major', labelsize=22)
plt.show()
Accuracy 0.7719897523484202
F1-score [0.8441331 0.57551669]
precision recall f1-score support
0.0 0.85 0.83 0.84 867
1.0 0.56 0.60 0.58 304
accuracy 0.77 1171
macro avg 0.71 0.71 0.71 1171
weighted avg 0.78 0.77 0.77 1171
0.7146523857220907
# Cost-complexity pruning: candidate alphas from the fitted tree.
path = clf_dt.cost_complexity_pruning_path(X_train, y_train)
ccp_alphas = path.ccp_alphas
ccp_alphas = ccp_alphas[:-1]  # drop the last alpha (it prunes to a single node)

# Fit one tree per candidate alpha to compare train vs test accuracy.
clf_dts = []
for ccp_alpha in ccp_alphas:
    clf_dt = DecisionTreeClassifier(random_state=0, ccp_alpha=ccp_alpha)
    clf_dt.fit(X_train, y_train)
    clf_dts.append(clf_dt)
train_scores = [clf_dt.score(X_train, y_train) for clf_dt in clf_dts]
test_scores = [clf_dt.score(X_test, y_test) for clf_dt in clf_dts]

fig, ax = plt.subplots()
ax.set_xlabel("alpha")
ax.set_ylabel("accuracy")
ax.set_title("Accuracy vs alpha for training and testing sets")
ax.plot(ccp_alphas, train_scores, marker='o', label='train', drawstyle='steps-post')
ax.plot(ccp_alphas, test_scores, marker='o', label='test', drawstyle='steps-post')
ax.legend()
plt.show()

# 10-fold cross-validation for one candidate alpha.
clf_dt = DecisionTreeClassifier(criterion='gini', max_depth=None,
                                min_samples_split=2, min_samples_leaf=1,
                                random_state=42, ccp_alpha=0.003)
scores = cross_val_score(clf_dt, X_train, y_train, cv=10)
# BUG FIX: the original assigned this DataFrame to `df`, clobbering the
# words_glasgow dataset loaded at the top of the file; use a dedicated name.
cv_scores_df = pd.DataFrame(data={'tree': range(10), 'accuracy': scores})
cv_scores_df.plot(x='tree', y='accuracy', marker='o', linestyle='--')
<matplotlib.axes._subplots.AxesSubplot at 0x7f87324c1410>
# Cross-validate a pruned tree at every candidate alpha and record the
# mean and standard deviation of the 10-fold accuracies.
alpha_loop_values = []
for ccp_alpha in ccp_alphas:
    clf_dt = DecisionTreeClassifier(random_state=0, ccp_alpha=ccp_alpha)
    scores = cross_val_score(clf_dt, X_train, y_train, cv=10)
    alpha_loop_values.append([ccp_alpha, np.mean(scores), np.std(scores)])

# Summarize and plot mean CV accuracy as a function of alpha.
alpha_results = pd.DataFrame(alpha_loop_values, columns=['alpha', 'mean_accuracy', 'std'])
alpha_results.plot(x='alpha', y='mean_accuracy', marker='o', linestyle='--')
<matplotlib.axes._subplots.AxesSubplot at 0x7f87320437d0>
# Inspect the candidate alphas in the region of interest.
in_range = (alpha_results['alpha'] > 0.0025) & (alpha_results['alpha'] < 0.005)
alpha_results[in_range]
| alpha | mean_accuracy | std | |
|---|---|---|---|
| 201 | 0.002712 | 0.825980 | 0.012110 |
| 202 | 0.002828 | 0.824838 | 0.009346 |
| 203 | 0.003022 | 0.823699 | 0.010039 |
| 204 | 0.003397 | 0.819710 | 0.015653 |
| 205 | 0.003625 | 0.819710 | 0.014745 |
| 206 | 0.003792 | 0.819425 | 0.014533 |
| 207 | 0.004665 | 0.817431 | 0.014101 |
# Alpha chosen from the CV table above.
ideal_ccp_alpha = 0.003242
clf_dt_pruned = DecisionTreeClassifier(criterion='gini', max_depth=None,
                                       min_samples_split=2, min_samples_leaf=1,
                                       random_state=42, ccp_alpha=ideal_ccp_alpha)
clf_dt_pruned = clf_dt_pruned.fit(X_train, y_train)

# plot_confusion_matrix is deprecated in sklearn 1.0 and removed in 1.2;
# use the ConfusionMatrixDisplay class method instead.
from sklearn.metrics import ConfusionMatrixDisplay
ConfusionMatrixDisplay.from_estimator(clf_dt_pruned, X_test, y_test,
                                      display_labels=['not aroused', 'aroused'])
/usr/local/lib/python3.7/dist-packages/sklearn/utils/deprecation.py:87: FutureWarning: Function plot_confusion_matrix is deprecated; Function `plot_confusion_matrix` is deprecated in 1.0 and will be removed in 1.2. Use one of the class methods: ConfusionMatrixDisplay.from_predictions or ConfusionMatrixDisplay.from_estimator. warnings.warn(msg, category=FutureWarning)
<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x7f8732590310>
# Render the pruned decision tree.
from sklearn.tree import plot_tree

plt.figure(figsize=(15, 7.5))
plot_tree(
    clf_dt_pruned,
    filled=True,
    rounded=True,
    class_names=["not aroused", "aroused"],
    feature_names=X.columns,
)
[Text(453.375, 366.93, 'valence <= 0.698\ngini = 0.372\nsamples = 3511\nvalue = [2645, 866]\nclass = not aroused'), Text(279.0, 285.39, 'semsize <= 0.638\ngini = 0.25\nsamples = 2829\nvalue = [2414, 415]\nclass = not aroused'), Text(139.5, 203.85000000000002, 'semsize <= 0.496\ngini = 0.164\nsamples = 2299\nvalue = [2092, 207]\nclass = not aroused'), Text(69.75, 122.31, 'gini = 0.101\nsamples = 1542\nvalue = [1460, 82]\nclass = not aroused'), Text(209.25, 122.31, 'gini = 0.276\nsamples = 757\nvalue = [632, 125]\nclass = not aroused'), Text(418.5, 203.85000000000002, 'dominance <= 0.608\ngini = 0.477\nsamples = 530\nvalue = [322, 208]\nclass = not aroused'), Text(348.75, 122.31, 'valence <= 0.402\ngini = 0.452\nsamples = 460\nvalue = [301, 159]\nclass = not aroused'), Text(279.0, 40.77000000000004, 'gini = 0.499\nsamples = 244\nvalue = [128, 116]\nclass = not aroused'), Text(418.5, 40.77000000000004, 'gini = 0.319\nsamples = 216\nvalue = [173, 43]\nclass = not aroused'), Text(488.25, 122.31, 'gini = 0.42\nsamples = 70\nvalue = [21, 49]\nclass = aroused'), Text(627.75, 285.39, 'dominance <= 0.585\ngini = 0.448\nsamples = 682\nvalue = [231, 451]\nclass = aroused'), Text(558.0, 203.85000000000002, 'gini = 0.494\nsamples = 269\nvalue = [149, 120]\nclass = not aroused'), Text(697.5, 203.85000000000002, 'valence <= 0.801\ngini = 0.318\nsamples = 413\nvalue = [82, 331]\nclass = aroused'), Text(627.75, 122.31, 'perceivability <= 0.326\ngini = 0.464\nsamples = 150\nvalue = [55, 95]\nclass = aroused'), Text(558.0, 40.77000000000004, 'gini = 0.469\nsamples = 56\nvalue = [35, 21]\nclass = not aroused'), Text(697.5, 40.77000000000004, 'gini = 0.335\nsamples = 94\nvalue = [20, 74]\nclass = aroused'), Text(767.25, 122.31, 'gini = 0.184\nsamples = 263\nvalue = [27, 236]\nclass = aroused')]
# Evaluate the pruned tree on the test set.
# BUG FIX: removed the dead `predict(X_train)` call whose result was
# immediately overwritten by the test-set predictions.
y_pred = clf_dt_pruned.predict(X_test)
print('Accuracy %s' % accuracy_score(y_test, y_pred))
print('F1-score %s' % f1_score(y_test, y_pred, average='weighted'))
print(classification_report(y_test, y_pred))

# ROC curve from the positive-class probability.
y_score = clf_dt_pruned.predict_proba(X_test)
fpr, tpr, th = roc_curve(y_test, y_score[:, 1])
roc_auc = auc(fpr, tpr)
print(roc_auc)

plt.figure(figsize=(8, 5))
plt.plot(fpr, tpr)
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate', fontsize=20)
plt.ylabel('True Positive Rate', fontsize=20)
plt.tick_params(axis='both', which='major', labelsize=22)
plt.show()
Accuracy 0.8129803586678053
F1-score 0.7877208681457515
precision recall f1-score support
0.0 0.82 0.97 0.88 867
1.0 0.79 0.38 0.51 304
accuracy 0.81 1171
macro avg 0.80 0.67 0.70 1171
weighted avg 0.81 0.81 0.79 1171
0.8427881988708796
# --- Binary classification target: valence ---
refvar = "valence"
taglio = 0.67  # binarization threshold

X = df_class_ref.drop(refvar, axis=1).copy()
# FIX: binarize in one vectorized step (1.0 if value >= threshold, else 0.0).
# The original mutated y in place with two masks, with the second mask
# computed on the already-mutated series — correct only because 1 >= taglio.
y = (df_class_ref[refvar] >= taglio).astype(float)

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Fully-grown tree, used to derive the cost-complexity pruning path.
clf_dt = DecisionTreeClassifier(criterion='gini', max_depth=None,
                                min_samples_split=2, min_samples_leaf=1,
                                random_state=42)
clf_dt = clf_dt.fit(X_train, y_train)

path = clf_dt.cost_complexity_pruning_path(X_train, y_train)
ccp_alphas = path.ccp_alphas
ccp_alphas = ccp_alphas[:-1]  # drop the alpha that prunes to a single node

# One tree per candidate alpha: compare train vs test accuracy.
clf_dts = []
for ccp_alpha in ccp_alphas:
    clf_dt = DecisionTreeClassifier(random_state=0, ccp_alpha=ccp_alpha)
    clf_dt.fit(X_train, y_train)
    clf_dts.append(clf_dt)
train_scores = [clf_dt.score(X_train, y_train) for clf_dt in clf_dts]
test_scores = [clf_dt.score(X_test, y_test) for clf_dt in clf_dts]

fig, ax = plt.subplots()
ax.set_xlabel("alpha")
ax.set_ylabel("accuracy")
ax.set_title("Accuracy vs alpha for training and testing sets")
ax.plot(ccp_alphas, train_scores, marker='o', label='train', drawstyle='steps-post')
ax.plot(ccp_alphas, test_scores, marker='o', label='test', drawstyle='steps-post')
ax.legend()
plt.show()

# 10-fold CV across all candidate alphas.
alpha_loop_values = []
for ccp_alpha in ccp_alphas:
    clf_dt = DecisionTreeClassifier(criterion='gini', max_depth=None,
                                    min_samples_split=2, min_samples_leaf=1,
                                    random_state=0, ccp_alpha=ccp_alpha)
    scores = cross_val_score(clf_dt, X_train, y_train, cv=10)
    alpha_loop_values.append([ccp_alpha, np.mean(scores), np.std(scores)])
alpha_results = pd.DataFrame(alpha_loop_values,
                             columns=['alpha', 'mean_accuracy', 'std'])
alpha_results.plot(x='alpha',
                   y='mean_accuracy',
                   marker='o',
                   linestyle='--')
<matplotlib.axes._subplots.AxesSubplot at 0x7f87312c8e90>
# Inspect the candidate alphas in the region of interest.
in_range = (alpha_results['alpha'] > 0.0018) & (alpha_results['alpha'] < 0.0028)
alpha_results[in_range]
| alpha | mean_accuracy | std | |
|---|---|---|---|
| 140 | 0.001872 | 0.886080 | 0.017318 |
| 141 | 0.001910 | 0.885510 | 0.016445 |
| 142 | 0.002273 | 0.885795 | 0.016721 |
| 143 | 0.002309 | 0.885795 | 0.016721 |
| 144 | 0.002383 | 0.885795 | 0.016721 |
# Alpha chosen from the CV table above.
# FIX: dropped the redundant float() cast — the literal is already a float.
ideal_ccp_alpha = 0.002084
clf_dt_pruned = DecisionTreeClassifier(criterion='gini', max_depth=None,
                                       min_samples_split=2, min_samples_leaf=1,
                                       random_state=42, ccp_alpha=ideal_ccp_alpha)
clf_dt_pruned = clf_dt_pruned.fit(X_train, y_train)

# Render the pruned tree.
plt.figure(figsize=(15, 7.5))
from sklearn.tree import plot_tree
plot_tree(clf_dt_pruned,
          filled=True,
          rounded=True,
          class_names=["not val", "val"],
          feature_names=X.columns)
[Text(397.575, 378.5785714285714, 'dominance <= 0.595\ngini = 0.354\nsamples = 3511\nvalue = [2703, 808]\nclass = not val'), Text(167.4, 320.3357142857143, 'masculinity <= 0.35\ngini = 0.225\nsamples = 2845\nvalue = [2477, 368]\nclass = not val'), Text(83.7, 262.09285714285716, 'arousal <= 0.35\ngini = 0.497\nsamples = 321\nvalue = [149, 172]\nclass = val'), Text(41.85, 203.85, 'gini = 0.313\nsamples = 72\nvalue = [58, 14]\nclass = not val'), Text(125.55000000000001, 203.85, 'dominance <= 0.52\ngini = 0.464\nsamples = 249\nvalue = [91, 158]\nclass = val'), Text(83.7, 145.60714285714283, 'aoa <= 0.287\ngini = 0.5\nsamples = 142\nvalue = [69, 73]\nclass = val'), Text(41.85, 87.3642857142857, 'gini = 0.289\nsamples = 40\nvalue = [7, 33]\nclass = val'), Text(125.55000000000001, 87.3642857142857, 'gini = 0.477\nsamples = 102\nvalue = [62, 40]\nclass = not val'), Text(167.4, 145.60714285714283, 'gini = 0.327\nsamples = 107\nvalue = [22, 85]\nclass = val'), Text(251.10000000000002, 262.09285714285716, 'arousal <= 0.453\ngini = 0.143\nsamples = 2524\nvalue = [2328, 196]\nclass = not val'), Text(209.25, 203.85, 'gini = 0.062\nsamples = 1777\nvalue = [1720, 57]\nclass = not val'), Text(292.95, 203.85, 'masculinity <= 0.507\ngini = 0.303\nsamples = 747\nvalue = [608, 139]\nclass = not val'), Text(251.10000000000002, 145.60714285714283, 'dominance <= 0.467\ngini = 0.484\nsamples = 244\nvalue = [144, 100]\nclass = not val'), Text(209.25, 87.3642857142857, 'gini = 0.288\nsamples = 103\nvalue = [85, 18]\nclass = not val'), Text(292.95, 87.3642857142857, 'semsize <= 0.485\ngini = 0.487\nsamples = 141\nvalue = [59, 82]\nclass = val'), Text(251.10000000000002, 29.121428571428567, 'gini = 0.415\nsamples = 51\nvalue = [36, 15]\nclass = not val'), Text(334.8, 29.121428571428567, 'gini = 0.38\nsamples = 90\nvalue = [23, 67]\nclass = val'), Text(334.8, 145.60714285714283, 'gini = 0.143\nsamples = 503\nvalue = [464, 39]\nclass = not val'), Text(627.75, 320.3357142857143, 'arousal <= 
0.488\ngini = 0.448\nsamples = 666\nvalue = [226, 440]\nclass = val'), Text(502.20000000000005, 262.09285714285716, 'perceivability <= 0.463\ngini = 0.461\nsamples = 189\nvalue = [121, 68]\nclass = not val'), Text(460.35, 203.85, 'masculinity <= 0.447\ngini = 0.487\nsamples = 105\nvalue = [44, 61]\nclass = val'), Text(418.5, 145.60714285714283, 'gini = 0.185\nsamples = 29\nvalue = [3, 26]\nclass = val'), Text(502.20000000000005, 145.60714285714283, 'arousal <= 0.438\ngini = 0.497\nsamples = 76\nvalue = [41, 35]\nclass = not val'), Text(460.35, 87.3642857142857, 'gini = 0.436\nsamples = 56\nvalue = [38, 18]\nclass = not val'), Text(544.0500000000001, 87.3642857142857, 'gini = 0.255\nsamples = 20\nvalue = [3, 17]\nclass = val'), Text(544.0500000000001, 203.85, 'gini = 0.153\nsamples = 84\nvalue = [77, 7]\nclass = not val'), Text(753.3000000000001, 262.09285714285716, 'masculinity <= 0.684\ngini = 0.343\nsamples = 477\nvalue = [105, 372]\nclass = val'), Text(711.45, 203.85, 'arousal <= 0.575\ngini = 0.236\nsamples = 409\nvalue = [56, 353]\nclass = val'), Text(669.6, 145.60714285714283, 'masculinity <= 0.551\ngini = 0.447\nsamples = 83\nvalue = [28, 55]\nclass = val'), Text(627.75, 87.3642857142857, 'gini = 0.368\nsamples = 70\nvalue = [17, 53]\nclass = val'), Text(711.45, 87.3642857142857, 'gini = 0.26\nsamples = 13\nvalue = [11, 2]\nclass = not val'), Text(753.3000000000001, 145.60714285714283, 'gini = 0.157\nsamples = 326\nvalue = [28, 298]\nclass = val'), Text(795.15, 203.85, 'gini = 0.403\nsamples = 68\nvalue = [49, 19]\nclass = not val')]
# Evaluate the pruned valence tree on the test set.
# BUG FIX: removed the dead `predict(X_train)` call (result was overwritten).
y_pred = clf_dt_pruned.predict(X_test)
print('Accuracy %s' % accuracy_score(y_test, y_pred))
print('F1-score %s' % f1_score(y_test, y_pred, average='weighted'))
print(classification_report(y_test, y_pred))

y_score = clf_dt_pruned.predict_proba(X_test)

# plot_confusion_matrix is deprecated (removed in sklearn 1.2);
# use the ConfusionMatrixDisplay class method instead.
from sklearn.metrics import ConfusionMatrixDisplay
ConfusionMatrixDisplay.from_estimator(clf_dt_pruned, X_test, y_test,
                                      display_labels=['not val', 'val'])

# ROC curve from the positive-class probability.
fpr, tpr, th = roc_curve(y_test, y_score[:, 1])
roc_auc = auc(fpr, tpr)
print(roc_auc)

plt.figure(figsize=(8, 5))
plt.plot(fpr, tpr)
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate', fontsize=20)
plt.ylabel('True Positive Rate', fontsize=20)
plt.tick_params(axis='both', which='major', labelsize=22)
plt.show()
Accuracy 0.8736122971818958
F1-score 0.8681346409232079
precision recall f1-score support
0.0 0.88 0.95 0.92 864
1.0 0.83 0.65 0.73 307
accuracy 0.87 1171
macro avg 0.86 0.80 0.82 1171
weighted avg 0.87 0.87 0.87 1171
0.8893394106647363
/usr/local/lib/python3.7/dist-packages/sklearn/utils/deprecation.py:87: FutureWarning: Function plot_confusion_matrix is deprecated; Function `plot_confusion_matrix` is deprecated in 1.0 and will be removed in 1.2. Use one of the class methods: ConfusionMatrixDisplay.from_predictions or ConfusionMatrixDisplay.from_estimator. warnings.warn(msg, category=FutureWarning)
# --- Binary classification target: dominance ---
refvar = "dominance"
taglio = 0.57  # binarization threshold

X = df_class_ref.drop(refvar, axis=1).copy()
# FIX: single vectorized binarization (1.0 if >= threshold else 0.0) instead
# of the original fragile two-mask in-place assignment.
y = (df_class_ref[refvar] >= taglio).astype(float)

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Fully-grown tree, used to derive the cost-complexity pruning path.
clf_dt = DecisionTreeClassifier(criterion='gini', max_depth=None,
                                min_samples_split=2, min_samples_leaf=1,
                                random_state=42)
clf_dt = clf_dt.fit(X_train, y_train)

path = clf_dt.cost_complexity_pruning_path(X_train, y_train)
ccp_alphas = path.ccp_alphas
ccp_alphas = ccp_alphas[:-1]  # drop the alpha that prunes to a single node

# One tree per candidate alpha: compare train vs test accuracy.
clf_dts = []
for ccp_alpha in ccp_alphas:
    clf_dt = DecisionTreeClassifier(random_state=0, ccp_alpha=ccp_alpha)
    clf_dt.fit(X_train, y_train)
    clf_dts.append(clf_dt)
train_scores = [clf_dt.score(X_train, y_train) for clf_dt in clf_dts]
test_scores = [clf_dt.score(X_test, y_test) for clf_dt in clf_dts]

fig, ax = plt.subplots()
ax.set_xlabel("alpha")
ax.set_ylabel("accuracy")
ax.set_title("Accuracy vs alpha for training and testing sets")
ax.plot(ccp_alphas, train_scores, marker='o', label='train', drawstyle='steps-post')
ax.plot(ccp_alphas, test_scores, marker='o', label='test', drawstyle='steps-post')
ax.legend()
plt.show()

# 10-fold CV across all candidate alphas (with error bars on the plot).
alpha_loop_values = []
for ccp_alpha in ccp_alphas:
    clf_dt = DecisionTreeClassifier(random_state=42, ccp_alpha=ccp_alpha)
    scores = cross_val_score(clf_dt, X_train, y_train, cv=10)
    alpha_loop_values.append([ccp_alpha, np.mean(scores), np.std(scores)])
alpha_results = pd.DataFrame(alpha_loop_values,
                             columns=['alpha', 'mean_accuracy', 'std'])
alpha_results.plot(x='alpha',
                   y='mean_accuracy',
                   yerr='std',
                   marker='o',
                   linestyle='--')
<matplotlib.axes._subplots.AxesSubplot at 0x7f87323e1e10>
# Inspect the candidate alphas in the region of interest.
in_range = (alpha_results['alpha'] > 0.0018) & (alpha_results['alpha'] < 0.0041)
alpha_results[in_range]
| alpha | mean_accuracy | std | |
|---|---|---|---|
| 197 | 0.001963 | 0.839931 | 0.007957 |
| 198 | 0.002448 | 0.837366 | 0.010027 |
| 199 | 0.002703 | 0.837364 | 0.011967 |
| 200 | 0.002885 | 0.835655 | 0.013418 |
| 201 | 0.003540 | 0.838219 | 0.014275 |
| 202 | 0.003945 | 0.838219 | 0.014275 |
| 203 | 0.003988 | 0.838219 | 0.014275 |
# Alpha chosen from the CV table above.
# FIX: dropped the redundant float() cast — the literal is already a float.
ideal_ccp_alpha = 0.001963
clf_dt_pruned = DecisionTreeClassifier(criterion='gini', max_depth=None,
                                       min_samples_split=2, min_samples_leaf=1,
                                       random_state=42, ccp_alpha=ideal_ccp_alpha)
clf_dt_pruned = clf_dt_pruned.fit(X_train, y_train)

# plot_confusion_matrix is deprecated (removed in sklearn 1.2);
# use the ConfusionMatrixDisplay class method instead.
from sklearn.metrics import ConfusionMatrixDisplay
ConfusionMatrixDisplay.from_estimator(clf_dt_pruned, X_test, y_test,
                                      display_labels=['not dominant', 'dominant'])

# Render the pruned tree.
plt.figure(figsize=(15, 7.5))
from sklearn.tree import plot_tree
plot_tree(clf_dt_pruned,
          filled=True,
          rounded=True,
          class_names=["not dominant", "dominant"],
          feature_names=X.columns)
# Evaluate the pruned dominance tree on the test set.
# BUG FIX: removed the dead `predict(X_train)` call (result was overwritten).
y_pred = clf_dt_pruned.predict(X_test)
print('Accuracy %s' % accuracy_score(y_test, y_pred))
print('F1-score %s' % f1_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# ROC curve from the positive-class probability.
y_score = clf_dt_pruned.predict_proba(X_test)
fpr, tpr, th = roc_curve(y_test, y_score[:, 1])
roc_auc = auc(fpr, tpr)
print(roc_auc)

plt.figure(figsize=(8, 5))
plt.plot(fpr, tpr)
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate', fontsize=20)
plt.ylabel('True Positive Rate', fontsize=20)
plt.tick_params(axis='both', which='major', labelsize=22)
plt.show()
/usr/local/lib/python3.7/dist-packages/sklearn/utils/deprecation.py:87: FutureWarning: Function plot_confusion_matrix is deprecated; Function `plot_confusion_matrix` is deprecated in 1.0 and will be removed in 1.2. Use one of the class methods: ConfusionMatrixDisplay.from_predictions or ConfusionMatrixDisplay.from_estimator. warnings.warn(msg, category=FutureWarning)
Accuracy 0.8300597779675492
F1-score 0.6223908918406071
precision recall f1-score support
0.0 0.86 0.92 0.89 879
1.0 0.70 0.56 0.62 292
accuracy 0.83 1171
macro avg 0.78 0.74 0.76 1171
weighted avg 0.82 0.83 0.82 1171
0.8587416429005564
# --- Binary classification target: familiarity ---
refvar = "familiarity"
taglio = 0.6  # binarization threshold

X = df_class_ref.drop(refvar, axis=1).copy()
# FIX: single vectorized binarization (1.0 if >= threshold else 0.0) instead
# of the original fragile two-mask in-place assignment.
y = (df_class_ref[refvar] >= taglio).astype(float)

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Fully-grown tree, used to derive the cost-complexity pruning path.
clf_dt = DecisionTreeClassifier(criterion='gini', max_depth=None,
                                min_samples_split=2, min_samples_leaf=1,
                                random_state=42)
clf_dt = clf_dt.fit(X_train, y_train)

path = clf_dt.cost_complexity_pruning_path(X_train, y_train)
ccp_alphas = path.ccp_alphas
ccp_alphas = ccp_alphas[:-1]  # drop the alpha that prunes to a single node

# One tree per candidate alpha: compare train vs test accuracy.
clf_dts = []
for ccp_alpha in ccp_alphas:
    clf_dt = DecisionTreeClassifier(random_state=0, ccp_alpha=ccp_alpha)
    clf_dt.fit(X_train, y_train)
    clf_dts.append(clf_dt)
train_scores = [clf_dt.score(X_train, y_train) for clf_dt in clf_dts]
test_scores = [clf_dt.score(X_test, y_test) for clf_dt in clf_dts]

fig, ax = plt.subplots()
ax.set_xlabel("alpha")
ax.set_ylabel("accuracy")
ax.set_title("Accuracy vs alpha for training and testing sets")
ax.plot(ccp_alphas, train_scores, marker='o', label='train', drawstyle='steps-post')
ax.plot(ccp_alphas, test_scores, marker='o', label='test', drawstyle='steps-post')
ax.legend()
plt.show()

# 10-fold CV across all candidate alphas.
alpha_loop_values = []
for ccp_alpha in ccp_alphas:
    clf_dt = DecisionTreeClassifier(criterion='gini', max_depth=None,
                                    min_samples_split=2, min_samples_leaf=1,
                                    random_state=0, ccp_alpha=ccp_alpha)
    scores = cross_val_score(clf_dt, X_train, y_train, cv=10)
    alpha_loop_values.append([ccp_alpha, np.mean(scores), np.std(scores)])
alpha_results = pd.DataFrame(alpha_loop_values,
                             columns=['alpha', 'mean_accuracy', 'std'])
alpha_results.plot(x='alpha',
                   y='mean_accuracy',
                   marker='o',
                   linestyle='--')
<matplotlib.axes._subplots.AxesSubplot at 0x7f8732336c50>
# Inspect the candidate alphas in the region of interest.
in_range = (alpha_results['alpha'] > 0.0009) & (alpha_results['alpha'] < 0.0011)
alpha_results[in_range]
| alpha | mean_accuracy | std | |
|---|---|---|---|
| 182 | 0.000904 | 0.824559 | 0.022300 |
| 183 | 0.000911 | 0.826553 | 0.021298 |
| 184 | 0.000930 | 0.829968 | 0.019192 |
| 185 | 0.000944 | 0.830822 | 0.018603 |
| 186 | 0.000977 | 0.831677 | 0.019085 |
| 187 | 0.001006 | 0.831108 | 0.018665 |
| 188 | 0.001008 | 0.831108 | 0.018665 |
| 189 | 0.001015 | 0.830539 | 0.018191 |
| 190 | 0.001017 | 0.830254 | 0.018189 |
| 191 | 0.001020 | 0.829399 | 0.017752 |
| 192 | 0.001047 | 0.829399 | 0.017475 |
| 193 | 0.001051 | 0.829399 | 0.017475 |
| 194 | 0.001082 | 0.828829 | 0.016620 |
# Alpha chosen from the CV table above.
# FIX: dropped the redundant float() cast — the literal is already a float.
ideal_ccp_alpha = 0.000977
clf_dt_pruned = DecisionTreeClassifier(criterion='gini', max_depth=None,
                                       min_samples_split=2, min_samples_leaf=1,
                                       random_state=42, ccp_alpha=ideal_ccp_alpha)
clf_dt_pruned = clf_dt_pruned.fit(X_train, y_train)

# plot_confusion_matrix is deprecated (removed in sklearn 1.2);
# use the ConfusionMatrixDisplay class method instead.
# BUG FIX: the original labels were copy-pasted from the dominance section
# ('not dominant'/'dominant') and the tree used the misspelled
# "not valueable"/"valueable"; this section's target is familiarity.
from sklearn.metrics import ConfusionMatrixDisplay
ConfusionMatrixDisplay.from_estimator(clf_dt_pruned, X_test, y_test,
                                      display_labels=['not familiar', 'familiar'])

# Render the pruned tree.
plt.figure(figsize=(15, 7.5))
from sklearn.tree import plot_tree
plot_tree(clf_dt_pruned,
          filled=True,
          rounded=True,
          class_names=["not familiar", "familiar"],
          feature_names=X.columns)
# Evaluate the pruned familiarity tree on the test set.
# BUG FIX: removed the dead `predict(X_train)` call (result was overwritten).
y_pred = clf_dt_pruned.predict(X_test)
print('Accuracy %s' % accuracy_score(y_test, y_pred))
print('F1-score %s' % f1_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# ROC curve from the positive-class probability.
y_score = clf_dt_pruned.predict_proba(X_test)
fpr, tpr, th = roc_curve(y_test, y_score[:, 1])
roc_auc = auc(fpr, tpr)
print(roc_auc)

plt.figure(figsize=(8, 5))
plt.plot(fpr, tpr)
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate', fontsize=20)
plt.ylabel('True Positive Rate', fontsize=20)
plt.tick_params(axis='both', which='major', labelsize=22)
plt.show()
/usr/local/lib/python3.7/dist-packages/sklearn/utils/deprecation.py:87: FutureWarning: Function plot_confusion_matrix is deprecated; Function `plot_confusion_matrix` is deprecated in 1.0 and will be removed in 1.2. Use one of the class methods: ConfusionMatrixDisplay.from_predictions or ConfusionMatrixDisplay.from_estimator. warnings.warn(msg, category=FutureWarning)
Accuracy 0.8121263877028181
F1-score 0.8692033293697977
precision recall f1-score support
0.0 0.64 0.69 0.67 317
1.0 0.88 0.86 0.87 854
accuracy 0.81 1171
macro avg 0.76 0.77 0.77 1171
weighted avg 0.82 0.81 0.81 1171
0.8300814870086215
# --- Binary classification target: semsize ---
refvar = "semsize"
taglio = 0.63  # binarization threshold

X = df_class_ref.drop(refvar, axis=1).copy()
# FIX: single vectorized binarization (1.0 if >= threshold else 0.0) instead
# of the original fragile two-mask in-place assignment.
y = (df_class_ref[refvar] >= taglio).astype(float)

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Fully-grown tree, used to derive the cost-complexity pruning path.
clf_dt = DecisionTreeClassifier(criterion='gini', max_depth=None,
                                min_samples_split=2, min_samples_leaf=1,
                                random_state=42)
clf_dt = clf_dt.fit(X_train, y_train)

path = clf_dt.cost_complexity_pruning_path(X_train, y_train)
ccp_alphas = path.ccp_alphas
ccp_alphas = ccp_alphas[:-1]  # drop the alpha that prunes to a single node

# One tree per candidate alpha: compare train vs test accuracy.
clf_dts = []
for ccp_alpha in ccp_alphas:
    clf_dt = DecisionTreeClassifier(random_state=0, ccp_alpha=ccp_alpha)
    clf_dt.fit(X_train, y_train)
    clf_dts.append(clf_dt)
train_scores = [clf_dt.score(X_train, y_train) for clf_dt in clf_dts]
test_scores = [clf_dt.score(X_test, y_test) for clf_dt in clf_dts]

fig, ax = plt.subplots()
ax.set_xlabel("alpha")
ax.set_ylabel("accuracy")
ax.set_title("Accuracy vs alpha for training and testing sets")
ax.plot(ccp_alphas, train_scores, marker='o', label='train', drawstyle='steps-post')
ax.plot(ccp_alphas, test_scores, marker='o', label='test', drawstyle='steps-post')
ax.legend()
plt.show()

# 10-fold CV across all candidate alphas.
alpha_loop_values = []
for ccp_alpha in ccp_alphas:
    clf_dt = DecisionTreeClassifier(criterion='gini', max_depth=None,
                                    min_samples_split=2, min_samples_leaf=1,
                                    random_state=0, ccp_alpha=ccp_alpha)
    scores = cross_val_score(clf_dt, X_train, y_train, cv=10)
    alpha_loop_values.append([ccp_alpha, np.mean(scores), np.std(scores)])
alpha_results = pd.DataFrame(alpha_loop_values,
                             columns=['alpha', 'mean_accuracy', 'std'])
alpha_results.plot(x='alpha',
                   y='mean_accuracy',
                   marker='o',
                   linestyle='--')
<matplotlib.axes._subplots.AxesSubplot at 0x7f8731d5dc10>
# Inspect the candidate alphas in the region of interest.
in_range = (alpha_results['alpha'] > 0.0015) & (alpha_results['alpha'] < 0.002)
alpha_results[in_range]
| alpha | mean_accuracy | std | |
|---|---|---|---|
| 229 | 0.001589 | 0.800630 | 0.022148 |
| 230 | 0.001601 | 0.801199 | 0.022718 |
| 231 | 0.001606 | 0.801199 | 0.022718 |
| 232 | 0.001663 | 0.797781 | 0.024255 |
| 233 | 0.001754 | 0.797211 | 0.023980 |
| 234 | 0.001766 | 0.797211 | 0.023980 |
| 235 | 0.001851 | 0.795502 | 0.027092 |
| 236 | 0.001869 | 0.795502 | 0.027092 |
| 237 | 0.001922 | 0.795217 | 0.026902 |
| 238 | 0.001968 | 0.795217 | 0.026902 |
# Alpha chosen from the CV table above.
# FIX: dropped the redundant float() cast — the literal is already a float.
ideal_ccp_alpha = 0.001601
clf_dt_pruned = DecisionTreeClassifier(criterion='gini', max_depth=None,
                                       min_samples_split=2, min_samples_leaf=1,
                                       random_state=42, ccp_alpha=ideal_ccp_alpha)
clf_dt_pruned = clf_dt_pruned.fit(X_train, y_train)

# plot_confusion_matrix is deprecated (removed in sklearn 1.2);
# use the ConfusionMatrixDisplay class method instead.
from sklearn.metrics import ConfusionMatrixDisplay
ConfusionMatrixDisplay.from_estimator(clf_dt_pruned, X_test, y_test,
                                      display_labels=['small', 'big'])

# Render the pruned tree.
plt.figure(figsize=(15, 7.5))
from sklearn.tree import plot_tree
plot_tree(clf_dt_pruned,
          filled=True,
          rounded=True,
          class_names=["small", "big"],
          feature_names=X.columns)
# Evaluate the pruned semsize tree on the test set.
# BUG FIX: removed the dead `predict(X_train)` call (result was overwritten).
y_pred = clf_dt_pruned.predict(X_test)
print('Accuracy %s' % accuracy_score(y_test, y_pred))
print('F1-score %s' % f1_score(y_test, y_pred, average=None))
print(classification_report(y_test, y_pred))

# ROC curve from the positive-class probability.
y_score = clf_dt_pruned.predict_proba(X_test)
fpr, tpr, th = roc_curve(y_test, y_score[:, 1])
roc_auc = auc(fpr, tpr)
print(roc_auc)

plt.figure(figsize=(8, 5))
plt.plot(fpr, tpr)
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate', fontsize=20)
plt.ylabel('True Positive Rate', fontsize=20)
plt.tick_params(axis='both', which='major', labelsize=22)
plt.show()
/usr/local/lib/python3.7/dist-packages/sklearn/utils/deprecation.py:87: FutureWarning: Function plot_confusion_matrix is deprecated; Function `plot_confusion_matrix` is deprecated in 1.0 and will be removed in 1.2. Use one of the class methods: ConfusionMatrixDisplay.from_predictions or ConfusionMatrixDisplay.from_estimator. warnings.warn(msg, category=FutureWarning)
Accuracy 0.7950469684030743
F1-score [0.87261146 0.47598253]
precision recall f1-score support
0.0 0.81 0.95 0.87 865
1.0 0.72 0.36 0.48 306
accuracy 0.80 1171
macro avg 0.76 0.65 0.67 1171
weighted avg 0.78 0.80 0.77 1171
0.7947183497676528
# --- Binary classification target: masculinity ---
refvar = "masculinity"
taglio = 0.6  # binarization threshold

X = df_class_ref.drop(refvar, axis=1).copy()
# FIX: single vectorized binarization (1.0 if >= threshold else 0.0) instead
# of the original fragile two-mask in-place assignment.
y = (df_class_ref[refvar] >= taglio).astype(float)

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Fully-grown tree, used to derive the cost-complexity pruning path.
clf_dt = DecisionTreeClassifier(criterion='gini', max_depth=None,
                                min_samples_split=2, min_samples_leaf=1,
                                random_state=42)
clf_dt = clf_dt.fit(X_train, y_train)

path = clf_dt.cost_complexity_pruning_path(X_train, y_train)
ccp_alphas = path.ccp_alphas
ccp_alphas = ccp_alphas[:-1]  # drop the alpha that prunes to a single node

# One tree per candidate alpha: compare train vs test accuracy.
clf_dts = []
for ccp_alpha in ccp_alphas:
    clf_dt = DecisionTreeClassifier(random_state=0, ccp_alpha=ccp_alpha)
    clf_dt.fit(X_train, y_train)
    clf_dts.append(clf_dt)
train_scores = [clf_dt.score(X_train, y_train) for clf_dt in clf_dts]
test_scores = [clf_dt.score(X_test, y_test) for clf_dt in clf_dts]

fig, ax = plt.subplots()
ax.set_xlabel("alpha")
ax.set_ylabel("accuracy")
ax.set_title("Accuracy vs alpha for training and testing sets")
ax.plot(ccp_alphas, train_scores, marker='o', label='train', drawstyle='steps-post')
ax.plot(ccp_alphas, test_scores, marker='o', label='test', drawstyle='steps-post')
ax.legend()
plt.show()

# 10-fold CV across all candidate alphas.
alpha_loop_values = []
for ccp_alpha in ccp_alphas:
    clf_dt = DecisionTreeClassifier(criterion='gini', max_depth=None,
                                    min_samples_split=2, min_samples_leaf=1,
                                    random_state=0, ccp_alpha=ccp_alpha)
    scores = cross_val_score(clf_dt, X_train, y_train, cv=10)
    alpha_loop_values.append([ccp_alpha, np.mean(scores), np.std(scores)])
alpha_results = pd.DataFrame(alpha_loop_values,
                             columns=['alpha', 'mean_accuracy', 'std'])
alpha_results.plot(x='alpha',
                   y='mean_accuracy',
                   marker='o',
                   linestyle='--')
<matplotlib.axes._subplots.AxesSubplot at 0x7f87322800d0>
# Inspect the candidate alphas in the region of interest.
in_range = (alpha_results['alpha'] > 0.0015) & (alpha_results['alpha'] < 0.0025)
alpha_results[in_range]
| alpha | mean_accuracy | std | |
|---|---|---|---|
| 252 | 0.001542 | 0.751076 | 0.016979 |
| 253 | 0.001555 | 0.751361 | 0.017161 |
| 254 | 0.001591 | 0.751645 | 0.016813 |
| 255 | 0.001603 | 0.752215 | 0.017575 |
| 256 | 0.001677 | 0.753923 | 0.013112 |
| 257 | 0.001754 | 0.751363 | 0.016140 |
| 258 | 0.001859 | 0.753642 | 0.018351 |
| 259 | 0.001972 | 0.754779 | 0.016440 |
| 260 | 0.001985 | 0.754779 | 0.015887 |
| 261 | 0.002007 | 0.754779 | 0.015887 |
| 262 | 0.002352 | 0.753066 | 0.011932 |
| 263 | 0.002381 | 0.750217 | 0.008731 |
# Alpha chosen from the CV table above.
# FIX: dropped the redundant float() cast — the literal is already a float.
ideal_ccp_alpha = 0.001985
clf_dt_pruned = DecisionTreeClassifier(criterion='gini', max_depth=None,
                                       min_samples_split=2, min_samples_leaf=1,
                                       random_state=42, ccp_alpha=ideal_ccp_alpha)
clf_dt_pruned = clf_dt_pruned.fit(X_train, y_train)

# plot_confusion_matrix is deprecated (removed in sklearn 1.2);
# use the ConfusionMatrixDisplay class method instead.
from sklearn.metrics import ConfusionMatrixDisplay
ConfusionMatrixDisplay.from_estimator(clf_dt_pruned, X_test, y_test,
                                      display_labels=['feminine', 'masculine'])

# Render the pruned tree.
plt.figure(figsize=(15, 7.5))
from sklearn.tree import plot_tree
plot_tree(clf_dt_pruned,
          filled=True,
          rounded=True,
          class_names=["feminine", "masculine"],
          feature_names=X.columns)
# Evaluate the pruned masculinity tree on the test set.
# BUG FIX: removed the dead `predict(X_train)` call (result was overwritten).
y_pred = clf_dt_pruned.predict(X_test)
print('Accuracy %s' % accuracy_score(y_test, y_pred))
print('F1-score %s' % f1_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# ROC curve from the positive-class probability.
y_score = clf_dt_pruned.predict_proba(X_test)
fpr, tpr, th = roc_curve(y_test, y_score[:, 1])
roc_auc = auc(fpr, tpr)
print(roc_auc)

plt.figure(figsize=(8, 5))
plt.plot(fpr, tpr)
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate', fontsize=20)
plt.ylabel('True Positive Rate', fontsize=20)
plt.tick_params(axis='both', which='major', labelsize=22)
plt.show()
/usr/local/lib/python3.7/dist-packages/sklearn/utils/deprecation.py:87: FutureWarning: Function plot_confusion_matrix is deprecated; Function `plot_confusion_matrix` is deprecated in 1.0 and will be removed in 1.2. Use one of the class methods: ConfusionMatrixDisplay.from_predictions or ConfusionMatrixDisplay.from_estimator. warnings.warn(msg, category=FutureWarning)
Accuracy 0.7762596071733561
F1-score 0.5130111524163568
precision recall f1-score support
0.0 0.83 0.88 0.85 873
1.0 0.57 0.46 0.51 298
accuracy 0.78 1171
macro avg 0.70 0.67 0.68 1171
weighted avg 0.76 0.78 0.77 1171
0.7482279726623462
# --- Binary classification target: polysemy ---
refvar = "polysemy"
taglio = 0.63  # binarization threshold

X = df_class_ref.drop(refvar, axis=1).copy()
# FIX: single vectorized binarization (1.0 if >= threshold else 0.0) instead
# of the original fragile two-mask in-place assignment.
y = (df_class_ref[refvar] >= taglio).astype(float)

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Fully-grown tree, used to derive the cost-complexity pruning path.
clf_dt = DecisionTreeClassifier(criterion='gini', max_depth=None,
                                min_samples_split=2, min_samples_leaf=1,
                                random_state=42)
clf_dt = clf_dt.fit(X_train, y_train)

path = clf_dt.cost_complexity_pruning_path(X_train, y_train)
ccp_alphas = path.ccp_alphas
ccp_alphas = ccp_alphas[:-1]  # drop the alpha that prunes to a single node

# One tree per candidate alpha: compare train vs test accuracy.
clf_dts = []
for ccp_alpha in ccp_alphas:
    clf_dt = DecisionTreeClassifier(random_state=0, ccp_alpha=ccp_alpha)
    clf_dt.fit(X_train, y_train)
    clf_dts.append(clf_dt)
train_scores = [clf_dt.score(X_train, y_train) for clf_dt in clf_dts]
test_scores = [clf_dt.score(X_test, y_test) for clf_dt in clf_dts]

fig, ax = plt.subplots()
ax.set_xlabel("alpha")
ax.set_ylabel("accuracy")
ax.set_title("Accuracy vs alpha for training and testing sets")
ax.plot(ccp_alphas, train_scores, marker='o', label='train', drawstyle='steps-post')
ax.plot(ccp_alphas, test_scores, marker='o', label='test', drawstyle='steps-post')
ax.legend()
plt.show()

# 10-fold CV across all candidate alphas (this is the cell that was
# interrupted in the recorded run — the traceback below is a KeyboardInterrupt,
# not a code error).
alpha_loop_values = []
for ccp_alpha in ccp_alphas:
    clf_dt = DecisionTreeClassifier(criterion='gini', max_depth=None,
                                    min_samples_split=2, min_samples_leaf=1,
                                    random_state=0, ccp_alpha=ccp_alpha)
    scores = cross_val_score(clf_dt, X_train, y_train, cv=10)
    alpha_loop_values.append([ccp_alpha, np.mean(scores), np.std(scores)])
alpha_results = pd.DataFrame(alpha_loop_values,
                             columns=['alpha', 'mean_accuracy', 'std'])
alpha_results.plot(x='alpha',
                   y='mean_accuracy',
                   marker='o',
                   linestyle='--')
--------------------------------------------------------------------------- KeyboardInterrupt Traceback (most recent call last) <ipython-input-173-fd3496329b61> in <module>() 48 for ccp_alpha in ccp_alphas: 49 clf_dt = DecisionTreeClassifier(criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1,random_state=0, ccp_alpha=ccp_alpha) ---> 50 scores= cross_val_score(clf_dt,X_train,y_train, cv=10) 51 alpha_loop_values.append([ccp_alpha,np.mean(scores), np.std(scores)]) 52 /usr/local/lib/python3.7/dist-packages/sklearn/model_selection/_validation.py in cross_val_score(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch, error_score) 519 fit_params=fit_params, 520 pre_dispatch=pre_dispatch, --> 521 error_score=error_score, 522 ) 523 return cv_results["test_score"] /usr/local/lib/python3.7/dist-packages/sklearn/model_selection/_validation.py in cross_validate(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch, return_train_score, return_estimator, error_score) 281 error_score=error_score, 282 ) --> 283 for train, test in cv.split(X, y, groups) 284 ) 285 /usr/local/lib/python3.7/dist-packages/joblib/parallel.py in __call__(self, iterable) 1044 self._iterating = self._original_iterator is not None 1045 -> 1046 while self.dispatch_one_batch(iterator): 1047 pass 1048 /usr/local/lib/python3.7/dist-packages/joblib/parallel.py in dispatch_one_batch(self, iterator) 859 return False 860 else: --> 861 self._dispatch(tasks) 862 return True 863 /usr/local/lib/python3.7/dist-packages/joblib/parallel.py in _dispatch(self, batch) 777 with self._lock: 778 job_idx = len(self._jobs) --> 779 job = self._backend.apply_async(batch, callback=cb) 780 # A job can complete so quickly than its callback is 781 # called before we get here, causing self._jobs to /usr/local/lib/python3.7/dist-packages/joblib/_parallel_backends.py in apply_async(self, func, callback) 206 def apply_async(self, func, callback=None): 207 
"""Schedule a func to be run""" --> 208 result = ImmediateResult(func) 209 if callback: 210 callback(result) /usr/local/lib/python3.7/dist-packages/joblib/_parallel_backends.py in __init__(self, batch) 570 # Don't delay the application, to avoid keeping the input 571 # arguments in memory --> 572 self.results = batch() 573 574 def get(self): /usr/local/lib/python3.7/dist-packages/joblib/parallel.py in __call__(self) 261 with parallel_backend(self._backend, n_jobs=self._n_jobs): 262 return [func(*args, **kwargs) --> 263 for func, args, kwargs in self.items] 264 265 def __reduce__(self): /usr/local/lib/python3.7/dist-packages/joblib/parallel.py in <listcomp>(.0) 261 with parallel_backend(self._backend, n_jobs=self._n_jobs): 262 return [func(*args, **kwargs) --> 263 for func, args, kwargs in self.items] 264 265 def __reduce__(self): /usr/local/lib/python3.7/dist-packages/sklearn/utils/fixes.py in __call__(self, *args, **kwargs) 209 def __call__(self, *args, **kwargs): 210 with config_context(**self.config): --> 211 return self.function(*args, **kwargs) 212 213 /usr/local/lib/python3.7/dist-packages/sklearn/model_selection/_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, return_estimator, split_progress, candidate_progress, error_score) 679 estimator.fit(X_train, **fit_params) 680 else: --> 681 estimator.fit(X_train, y_train, **fit_params) 682 683 except Exception: /usr/local/lib/python3.7/dist-packages/sklearn/tree/_classes.py in fit(self, X, y, sample_weight, check_input, X_idx_sorted) 940 sample_weight=sample_weight, 941 check_input=check_input, --> 942 X_idx_sorted=X_idx_sorted, 943 ) 944 return self /usr/local/lib/python3.7/dist-packages/sklearn/tree/_classes.py in fit(self, X, y, sample_weight, check_input, X_idx_sorted) 418 ) 419 --> 420 builder.build(self.tree_, X, y, sample_weight) 421 422 if self.n_outputs_ == 1 and 
is_classifier(self): KeyboardInterrupt:
# Inspect the CV table in the alpha range where mean accuracy peaked,
# then refit a pruned tree with the hand-picked alpha and evaluate it.
alpha_results[(alpha_results['alpha']>0.001)
              &
              (alpha_results['alpha']<0.004)]
ideal_ccp_alpha = 0.001944  # alpha chosen from the CV results above
clf_dt_pruned = DecisionTreeClassifier(criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1,random_state=42, ccp_alpha=ideal_ccp_alpha)
clf_dt_pruned = clf_dt_pruned.fit(X_train, y_train)
plot_confusion_matrix(clf_dt_pruned,
                      X_test,
                      y_test,
                      display_labels=['not pol','pol'])
plt.figure(figsize=(15,7.5))
from sklearn.tree import plot_tree
plot_tree(clf_dt_pruned,
          filled=True,
          rounded=True,
          class_names=["not pol","pol"],
          feature_names=X.columns)
# Test-set evaluation. (Removed: a dead predict-on-X_train whose result
# was immediately overwritten, and a no-op float() cast of the alpha.)
y_pred = clf_dt_pruned.predict(X_test)
print('Accuracy %s' % accuracy_score(y_test, y_pred))
print('F1-score %s' % f1_score(y_test, y_pred,average='weighted'))
print(classification_report(y_test, y_pred))
# ROC curve / AUC from the positive-class probabilities.
y_score = clf_dt_pruned.predict_proba(X_test)
fpr, tpr, th = roc_curve(y_test, y_score[:,1])
roc_auc = auc(fpr, tpr)
print(roc_auc)
plt.figure(figsize=(8,5))
plt.plot(fpr, tpr)
plt.plot([0,1], [0,1], 'k--')
plt.xlabel('False Positive Rate', fontsize=20)
plt.ylabel('True Positive Rate', fontsize=20)
plt.tick_params(axis='both', which='major', labelsize=22)
plt.show()
# --- Decision tree, binary target "perceivability" (cutoff 0.8) ---
refvar="perceivability"
taglio=0.8
X=df_class_ref.drop(refvar,axis=1).copy()
y=df_class_ref[refvar].copy()
# Binarize: >= cutoff -> 1, below -> 0.
y_up_index = y >= taglio
y[y_up_index]=1
y_zero_index = y < taglio
y[y_zero_index]=0
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
# Grow a full tree and collect the candidate pruning alphas.
clf_dt = DecisionTreeClassifier(criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1,random_state=42)
clf_dt = clf_dt.fit(X_train, y_train)
path = clf_dt.cost_complexity_pruning_path(X_train, y_train)
ccp_alphas = path.ccp_alphas
ccp_alphas = ccp_alphas[:-1]  # drop the trivial single-node alpha
clf_dts=[]
for ccp_alpha in ccp_alphas:
    clf_dt = DecisionTreeClassifier(random_state=0, ccp_alpha=ccp_alpha)
    clf_dt.fit(X_train, y_train)
    clf_dts.append(clf_dt)
# Train/test accuracy per pruned tree, plotted against alpha.
train_scores = [clf_dt.score(X_train, y_train) for clf_dt in clf_dts]
test_scores = [clf_dt.score(X_test, y_test) for clf_dt in clf_dts]
fig, ax =plt.subplots()
ax.set_xlabel("alpha")
ax.set_ylabel("accuracy")
ax.set_title("Accuracy vs alpha for training and testing sets")
ax.plot(ccp_alphas,train_scores, marker ='o',label='train',drawstyle='steps-post')
ax.plot(ccp_alphas,test_scores, marker ='o',label='test',drawstyle='steps-post')
ax.legend()
plt.show()
# 10-fold CV accuracy per candidate alpha.
alpha_loop_values =[]
for ccp_alpha in ccp_alphas:
    clf_dt = DecisionTreeClassifier(criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1,random_state=0, ccp_alpha=ccp_alpha)
    scores= cross_val_score(clf_dt,X_train,y_train, cv=10)
    alpha_loop_values.append([ccp_alpha,np.mean(scores), np.std(scores)])
alpha_results = pd.DataFrame(alpha_loop_values,
                             columns=['alpha','mean_accuracy','std'])
alpha_results.plot(x='alpha',
                   y='mean_accuracy',
                   marker='o',
                   linestyle='--')
# Zoom into the promising alpha range, then refit with the chosen value.
alpha_results[(alpha_results['alpha']>0.0014)
              &
              (alpha_results['alpha']<0.002)]
ideal_ccp_alpha = 0.001499
clf_dt_pruned = DecisionTreeClassifier(criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1,random_state=42, ccp_alpha=ideal_ccp_alpha)
clf_dt_pruned = clf_dt_pruned.fit(X_train, y_train)
# Fixed label typos ('not peveivable'/'perveivable') so the confusion
# matrix uses the same class names as the tree plot below.
plot_confusion_matrix(clf_dt_pruned,
                      X_test,
                      y_test,
                      display_labels=['not perceivable','perceivable'])
plt.figure(figsize=(15,7.5))
from sklearn.tree import plot_tree
plot_tree(clf_dt_pruned,
          filled=True,
          rounded=True,
          class_names=["not perceivable","perceivable"],
          feature_names=X.columns)
# Test-set evaluation (removed the dead predict-on-train that was
# immediately overwritten, and the no-op float() cast).
y_pred = clf_dt_pruned.predict(X_test)
print('Accuracy %s' % accuracy_score(y_test, y_pred))
print('F1-score %s' % f1_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
# ROC / AUC from positive-class probabilities.
y_score = clf_dt_pruned.predict_proba(X_test)
fpr, tpr, th = roc_curve(y_test, y_score[:,1])
roc_auc = auc(fpr, tpr)
print(roc_auc)
plt.figure(figsize=(8,5))
plt.plot(fpr, tpr)
plt.plot([0,1], [0,1], 'k--')
plt.xlabel('False Positive Rate', fontsize=20)
plt.ylabel('True Positive Rate', fontsize=20)
plt.tick_params(axis='both', which='major', labelsize=22)
plt.show()
# --- Decision tree, multiclass target "aoa" (no binarization) ---
refvar="aoa"
X=df_class_ref.drop(refvar,axis=1).copy()
y=df_class_ref[refvar].copy()
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
# Grow a full tree and collect the candidate pruning alphas.
clf_dt = DecisionTreeClassifier(criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1,random_state=42)
clf_dt = clf_dt.fit(X_train, y_train)
path = clf_dt.cost_complexity_pruning_path(X_train, y_train)
ccp_alphas = path.ccp_alphas
ccp_alphas = ccp_alphas[:-1]  # drop the trivial single-node alpha
clf_dts=[]
for ccp_alpha in ccp_alphas:
    clf_dt = DecisionTreeClassifier(random_state=0, ccp_alpha=ccp_alpha)
    clf_dt.fit(X_train, y_train)
    clf_dts.append(clf_dt)
# Train/test accuracy per pruned tree, plotted against alpha.
train_scores = [clf_dt.score(X_train, y_train) for clf_dt in clf_dts]
test_scores = [clf_dt.score(X_test, y_test) for clf_dt in clf_dts]
fig, ax =plt.subplots()
ax.set_xlabel("alpha")
ax.set_ylabel("accuracy")
ax.set_title("Accuracy vs alpha for training and testing sets")
ax.plot(ccp_alphas,train_scores, marker ='o',label='train',drawstyle='steps-post')
ax.plot(ccp_alphas,test_scores, marker ='o',label='test',drawstyle='steps-post')
ax.legend()
plt.show()
# 10-fold CV accuracy per candidate alpha.
alpha_loop_values =[]
for ccp_alpha in ccp_alphas:
    clf_dt = DecisionTreeClassifier(criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1,random_state=0, ccp_alpha=ccp_alpha)
    scores= cross_val_score(clf_dt,X_train,y_train, cv=10)
    alpha_loop_values.append([ccp_alpha,np.mean(scores), np.std(scores)])
alpha_results = pd.DataFrame(alpha_loop_values,
                             columns=['alpha','mean_accuracy','std'])
alpha_results.plot(x='alpha',
                   y='mean_accuracy',
                   marker='o',
                   linestyle='--')
# Zoom into the promising alpha range and pick the winner by hand.
alpha_results[(alpha_results['alpha']>0.0018)
              &
              (alpha_results['alpha']<0.0025)]
ideal_ccp_alpha = 0.001858
# criterion='gini' for consistency: every candidate alpha above was
# generated AND cross-validated with gini, so the tuned ccp_alpha is
# only meaningful for a gini tree (the original refit used 'entropy').
clf_dt_pruned = DecisionTreeClassifier(criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1,random_state=42, ccp_alpha=ideal_ccp_alpha)
clf_dt_pruned = clf_dt_pruned.fit(X_train, y_train)
plot_confusion_matrix(clf_dt_pruned,
                      X_test,
                      y_test,
                      display_labels=['0-2','2-4','4-6','6-8','8-10','10-12'])
plt.figure(figsize=(15,7.5))
from sklearn.tree import plot_tree
plot_tree(clf_dt_pruned,
          filled=True,
          rounded=True,
          class_names=['0-2','2-4','4-6','6-8','8-10','10-12'],
          feature_names=X.columns)
# Test-set evaluation; per-class F1 (average=None) for the multiclass task.
# (Removed the dead predict-on-train and the no-op float() cast.)
y_pred = clf_dt_pruned.predict(X_test)
print('Accuracy %s' % accuracy_score(y_test, y_pred))
print('F1-score %s' % f1_score(y_test, y_pred,average=None))
print(classification_report(y_test, y_pred))
print(clf_dt_pruned.predict_proba(X_test))
# Export the per-class report as a LaTeX table.
report = classification_report(y_test, y_pred, output_dict=True)
export = pd.DataFrame(report).transpose()
print(export.to_latex())
plt.show()
# --- Decision tree, binary target "aoa" (cutoff 0.6) ---
refvar="aoa"
taglio=0.6
X=df_class_ref.drop(refvar,axis=1).copy()  # features = everything but the target
y=df_class_ref[refvar].copy()
# Binarize: >= cutoff -> 1 ("older"), else 0 ("younger").
y_up_index = y >= taglio
y[y_up_index]=1
y_zero_index = y < taglio
y[y_zero_index]=0
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
# Grow a full tree, then enumerate the effective pruning alphas.
clf_dt = DecisionTreeClassifier(criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1,random_state=42)
clf_dt = clf_dt.fit(X_train, y_train)
path = clf_dt.cost_complexity_pruning_path(X_train, y_train)
ccp_alphas = path.ccp_alphas
ccp_alphas = ccp_alphas[:-1]  # drop the last alpha (single-node tree)
# One tree per candidate alpha.
clf_dts=[]
for ccp_alpha in ccp_alphas:
    clf_dt = DecisionTreeClassifier(random_state=0, ccp_alpha=ccp_alpha)
    clf_dt.fit(X_train, y_train)
    clf_dts.append(clf_dt)
# Train/test accuracy per pruned tree, plotted against alpha.
train_scores = [clf_dt.score(X_train, y_train) for clf_dt in clf_dts]
test_scores = [clf_dt.score(X_test, y_test) for clf_dt in clf_dts]
fig, ax =plt.subplots()
ax.set_xlabel("alpha")
ax.set_ylabel("accuracy")
ax.set_title("Accuracy vs alpha for training and testing sets")
ax.plot(ccp_alphas,train_scores, marker ='o',label='train',drawstyle='steps-post')
ax.plot(ccp_alphas,test_scores, marker ='o',label='test',drawstyle='steps-post')
ax.legend()
plt.show()
# 10-fold CV accuracy (mean, std) per alpha.
alpha_loop_values =[]
for ccp_alpha in ccp_alphas:
    clf_dt = DecisionTreeClassifier(criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1,random_state=0, ccp_alpha=ccp_alpha)
    scores= cross_val_score(clf_dt,X_train,y_train, cv=10)
    alpha_loop_values.append([ccp_alpha,np.mean(scores), np.std(scores)])
alpha_results = pd.DataFrame(alpha_loop_values,
                             columns=['alpha','mean_accuracy','std'])
# NOTE(review): solid linestyle here; the sibling cells use '--'.
alpha_results.plot(x='alpha',
                   y='mean_accuracy',
                   marker='o',
                   linestyle='-')
# Zoom into the promising alpha range and pick the winner by hand.
alpha_results[(alpha_results['alpha']>0.0025)
              &
              (alpha_results['alpha']<0.005)]
ideal_ccp_alpha = 0.003224
ideal_ccp_alpha = float(ideal_ccp_alpha)  # no-op: already a float
# Refit with the chosen alpha and evaluate on the held-out split.
clf_dt_pruned = DecisionTreeClassifier(criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1,random_state=42, ccp_alpha=ideal_ccp_alpha)
clf_dt_pruned = clf_dt_pruned.fit(X_train, y_train)
plot_confusion_matrix(clf_dt_pruned,
                      X_test,
                      y_test,
                      display_labels=['younger','older'])
plt.figure(figsize=(15,7.5))
from sklearn.tree import plot_tree
plot_tree(clf_dt_pruned,
          filled=True,
          rounded=True,
          class_names=["younger","older"],
          feature_names=X.columns)
y_pred = clf_dt_pruned.predict(X_train)  # NOTE(review): overwritten on the next line
y_pred = clf_dt_pruned.predict(X_test)
print('Accuracy %s' % accuracy_score(y_test, y_pred))
print('F1-score %s' % f1_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
# ROC / AUC from positive-class probabilities.
y_score = clf_dt_pruned.predict_proba(X_test)
fpr, tpr, th = roc_curve(y_test, y_score[:,1])
roc_auc = auc(fpr, tpr)
print(roc_auc)
plt.figure(figsize=(8,5))
plt.plot(fpr, tpr)
plt.plot([0,1], [0,1], 'k--')
plt.xlabel('False Positive Rate', fontsize=20)
plt.ylabel('True Positive Rate', fontsize=20)
plt.tick_params(axis='both', which='major', labelsize=22)
plt.show()
# --- Decision tree, multiclass target "web_corpus_freq" ---
refvar="web_corpus_freq"
X=df_class_ref.drop(refvar,axis=1).copy()  # features = everything but the target
y=df_class_ref[refvar].copy()
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
# Grow a full tree, then enumerate the effective pruning alphas.
clf_dt = DecisionTreeClassifier(criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1,random_state=42)
clf_dt = clf_dt.fit(X_train, y_train)
path = clf_dt.cost_complexity_pruning_path(X_train, y_train)
ccp_alphas = path.ccp_alphas
ccp_alphas = ccp_alphas[:-1]  # drop the last alpha (single-node tree)
# One tree per candidate alpha.
clf_dts=[]
for ccp_alpha in ccp_alphas:
    clf_dt = DecisionTreeClassifier(random_state=0, ccp_alpha=ccp_alpha)
    clf_dt.fit(X_train, y_train)
    clf_dts.append(clf_dt)
# Train/test accuracy per pruned tree, plotted against alpha.
train_scores = [clf_dt.score(X_train, y_train) for clf_dt in clf_dts]
test_scores = [clf_dt.score(X_test, y_test) for clf_dt in clf_dts]
fig, ax =plt.subplots()
ax.set_xlabel("alpha")
ax.set_ylabel("accuracy")
ax.set_title("Accuracy vs alpha for training and testing sets")
ax.plot(ccp_alphas,train_scores, marker ='o',label='train',drawstyle='steps-post')
ax.plot(ccp_alphas,test_scores, marker ='o',label='test',drawstyle='steps-post')
ax.legend()
plt.show()
# 10-fold CV accuracy (mean, std) per alpha; this cell also draws
# the std as error bars (yerr), unlike the sibling cells.
alpha_loop_values =[]
for ccp_alpha in ccp_alphas:
    clf_dt = DecisionTreeClassifier(criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1,random_state=0, ccp_alpha=ccp_alpha)
    scores= cross_val_score(clf_dt,X_train,y_train, cv=10)
    alpha_loop_values.append([ccp_alpha,np.mean(scores), np.std(scores)])
alpha_results = pd.DataFrame(alpha_loop_values,
                             columns=['alpha','mean_accuracy','std'])
alpha_results.plot(x='alpha',
                   y='mean_accuracy',
                   yerr='std',
                   marker='o',
                   linestyle='--')
# Zoom into the promising alpha range and pick the winner by hand.
alpha_results[(alpha_results['alpha']>0.0013)
              &
              (alpha_results['alpha']<0.0016)]
ideal_ccp_alpha = 0.001376
ideal_ccp_alpha = float(ideal_ccp_alpha)  # no-op: already a float
# Refit with the chosen alpha and evaluate on the held-out split.
clf_dt_pruned = DecisionTreeClassifier(criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1,random_state=42, ccp_alpha=ideal_ccp_alpha)
clf_dt_pruned = clf_dt_pruned.fit(X_train, y_train)
plot_confusion_matrix(clf_dt_pruned,
                      X_test,
                      y_test,
                      display_labels=['4','5','6','7','8','9'],
                      )
plt.figure(figsize=(15,7.5))
from sklearn.tree import plot_tree
plot_tree(clf_dt_pruned,
          filled=True,
          rounded=True,
          class_names=['4','5','6','7','8','9'],
          feature_names=X.columns)
y_pred = clf_dt_pruned.predict(X_train)  # NOTE(review): overwritten on the next line
y_pred = clf_dt_pruned.predict(X_test)
print('Accuracy %s' % accuracy_score(y_test, y_pred))
print('F1-score %s' % f1_score(y_test, y_pred,average=None))  # per-class F1
print(classification_report(y_test, y_pred))
print(clf_dt_pruned.predict_proba(X_test))
# Export the per-class report as a LaTeX table.
report = classification_report(y_test, y_pred, output_dict=True)
export = pd.DataFrame(report).transpose()
print(export.to_latex())
# Build a discretized copy of the preprocessed data for multiclass
# classification on "aoa": bin the score into integer classes 1..7.
df_class_aoa= dfprepro.copy()
df_class_aoa.head()
dataframe = [df_class_aoa]  # single-element list; the loop mutates df_class_aoa in place
for dataset in dataframe:
    dataset.loc[(dataset["aoa"] > 1) & (dataset["aoa"] <= 2), "aoa"] = 1
    dataset.loc[(dataset["aoa"] > 2)& (dataset["aoa"] <= 3), "aoa"] = 2
    dataset.loc[(dataset["aoa"] > 3)& (dataset["aoa"] <= 4), "aoa"] = 3
    dataset.loc[(dataset["aoa"] > 4)& (dataset["aoa"] <= 5), "aoa"] = 4
    dataset.loc[(dataset["aoa"] > 5)& (dataset["aoa"] <= 6), "aoa"] = 5
    dataset.loc[(dataset["aoa"] > 6)&( dataset["aoa"] <= 7), "aoa"] = 6
    dataset.loc[(dataset["aoa"] > 7), "aoa"] = 7  # everything above 7 collapses into class 7
df_class_aoa.head()
# Features = every column except the target.
attributes = [col for col in df_class_aoa.columns if col != 'aoa']
X = df_class_aoa[attributes].values
y = df_class_aoa['aoa']
# Stratified 70/30 split so each aoa class keeps its proportion.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=100)
len(df_class_aoa), X_train.shape[0], X_test.shape[0]
X=df_class_aoa.drop("aoa",axis=1).copy()  # re-bind X as a DataFrame (keeps .columns available)
X.dtypes
# NOTE(review): .head() keeps only the first 5 rows of the dummy frame, and
# the `columns=` argument is ignored when get_dummies receives a Series --
# presumably this was only a quick dtype inspection; confirm intent.
y_encoded=pd.get_dummies(y,columns=["aoa"]).head()
y_encoded.dtypes
X_train.shape, X_test.shape
# Fit an unpruned decision tree on the binned-"aoa" split and inspect it.
clf = DecisionTreeClassifier(criterion='gini', max_depth=None,
                             min_samples_split=2, min_samples_leaf=1)
clf.fit(X_train, y_train)
# Importance of every feature in the fitted tree.
for col, imp in zip(attributes, clf.feature_importances_):
    print(col, imp)
# Render the top two levels of the tree through graphviz.
dot_data = tree.export_graphviz(clf,
                                out_file=None,
                                feature_names=attributes,
                                class_names=[str(v) for v in clf.classes_],
                                filled=True,
                                rounded=True,
                                special_characters=True,
                                max_depth=2)
graph = pydotplus.graph_from_dot_data(dot_data)
Image(graph.create_png())
# Training-set predictions (first few shown for a quick sanity check).
y_pred = clf.predict(X_train)
y_pred[:5]
y_train.values[:5]
print(f'Accuracy {accuracy_score(y_train, y_pred)}')
print(f'F1 {f1_score(y_train, y_pred, average=None)}')
print(classification_report(y_train, y_pred))
# Training-set confusion matrix (TP / FN / FP / TN layout).
confusion_matrix(y_train, y_pred)
# Held-out test-set evaluation.
y_pred = clf.predict(X_test)
print(f'Accuracy {accuracy_score(y_test, y_pred)}')
print(f'F1-score {f1_score(y_test, y_pred, average=None)}')
print(classification_report(y_test, y_pred))
confusion_matrix(y_test, y_pred)
y_score = clf.predict_proba(X_test)
# NOTE(review): BUG -- fpr/tpr are NOT recomputed here; this plot reuses the
# fpr/tpr globals left over from an earlier binary cell, so the curve shown
# does not belong to this classifier. The y_score computed above is never
# used, and roc_curve would need a one-vs-rest treatment anyway for this
# multiclass (7-bin aoa) target.
plt.figure(figsize=(8,5))
plt.plot(fpr, tpr)
plt.plot([0,1], [0,1], 'k--')
plt.xlabel('False Positive Rate', fontsize=20)
plt.ylabel('True Positive Rate', fontsize=20)
plt.tick_params(axis='both', which='major', labelsize=22)
plt.show()
# --- Decision tree, binary target "length" (cutoff 0.35) ---
refvar="length"
taglio=0.35
X=df_class_ref.drop(refvar,axis=1).copy()  # features = everything but the target
y=df_class_ref[refvar].copy()
# Binarize: >= cutoff -> 1 ("long"), else 0 ("short").
y_up_index = y >= taglio
y[y_up_index]=1
y_zero_index = y < taglio
y[y_zero_index]=0
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
# Grow a full tree, then enumerate the effective pruning alphas.
clf_dt = DecisionTreeClassifier(criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1,random_state=42)
clf_dt = clf_dt.fit(X_train, y_train)
path = clf_dt.cost_complexity_pruning_path(X_train, y_train)
ccp_alphas = path.ccp_alphas
ccp_alphas = ccp_alphas[:-1]  # drop the last alpha (single-node tree)
# One tree per candidate alpha.
clf_dts=[]
for ccp_alpha in ccp_alphas:
    clf_dt = DecisionTreeClassifier(random_state=0, ccp_alpha=ccp_alpha)
    clf_dt.fit(X_train, y_train)
    clf_dts.append(clf_dt)
# Train/test accuracy per pruned tree, plotted against alpha.
train_scores = [clf_dt.score(X_train, y_train) for clf_dt in clf_dts]
test_scores = [clf_dt.score(X_test, y_test) for clf_dt in clf_dts]
fig, ax =plt.subplots()
ax.set_xlabel("alpha")
ax.set_ylabel("accuracy")
ax.set_title("Accuracy vs alpha for training and testing sets")
ax.plot(ccp_alphas,train_scores, marker ='o',label='train',drawstyle='steps-post')
ax.plot(ccp_alphas,test_scores, marker ='o',label='test',drawstyle='steps-post')
ax.legend()
plt.show()
# 10-fold CV accuracy (mean, std) per alpha.
alpha_loop_values =[]
for ccp_alpha in ccp_alphas:
    clf_dt = DecisionTreeClassifier(criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1,random_state=0, ccp_alpha=ccp_alpha)
    scores= cross_val_score(clf_dt,X_train,y_train, cv=10)
    alpha_loop_values.append([ccp_alpha,np.mean(scores), np.std(scores)])
alpha_results = pd.DataFrame(alpha_loop_values,
                             columns=['alpha','mean_accuracy','std'])
alpha_results.plot(x='alpha',
                   y='mean_accuracy',
                   marker='o',
                   linestyle='--')
# Zoom into the promising alpha range and pick the winner by hand.
alpha_results[(alpha_results['alpha']>0.001)
              &
              (alpha_results['alpha']<0.0025)]
ideal_ccp_alpha = 0.001538
ideal_ccp_alpha = float(ideal_ccp_alpha)  # no-op: already a float
# Refit with the chosen alpha and evaluate on the held-out split.
clf_dt_pruned = DecisionTreeClassifier(criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1,random_state=42, ccp_alpha=ideal_ccp_alpha)
clf_dt_pruned = clf_dt_pruned.fit(X_train, y_train)
plot_confusion_matrix(clf_dt_pruned,
                      X_test,
                      y_test,
                      display_labels=['short','long'])
plt.figure(figsize=(15,7.5))
from sklearn.tree import plot_tree
plot_tree(clf_dt_pruned,
          filled=True,
          rounded=True,
          class_names=["short","long"],
          feature_names=X.columns)
y_pred = clf_dt_pruned.predict(X_train)  # NOTE(review): overwritten on the next line
y_pred = clf_dt_pruned.predict(X_test)
print('Accuracy %s' % accuracy_score(y_test, y_pred))
print('F1-score %s' % f1_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
# ROC / AUC from positive-class probabilities.
y_score = clf_dt_pruned.predict_proba(X_test)
fpr, tpr, th = roc_curve(y_test, y_score[:,1])
roc_auc = auc(fpr, tpr)
print(roc_auc)
plt.figure(figsize=(8,5))
plt.plot(fpr, tpr)
plt.plot([0,1], [0,1], 'k--')
plt.xlabel('False Positive Rate', fontsize=20)
plt.ylabel('True Positive Rate', fontsize=20)
plt.tick_params(axis='both', which='major', labelsize=22)
plt.show()
from sklearn.neighbors import KNeighborsClassifier
# K selection for KNN on the binarized "aoa" target (cutoff 0.6).
refvar="aoa"
taglio=0.6
X=df_class_ref.drop(refvar,axis=1).copy()
y=df_class_ref[refvar].copy()
y_up_index = y >= taglio
y[y_up_index]=1
y_zero_index = y < taglio
y[y_zero_index]=0
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
k = 4
neigh = KNeighborsClassifier(n_neighbors = k).fit(X_train,y_train)
Pred_y = neigh.predict(X_test)
# Test-set error rate for K = 1..99.
error_rate = []
for i in range(1,100):
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(X_train,y_train)
    pred_i = knn.predict(X_test)
    error_rate.append(np.mean(pred_i != y_test))
plt.figure(figsize=(10,6))
plt.plot(range(1,100),error_rate,color='blue', linestyle='dashed',
         marker='o',markerfacecolor='red', markersize=10)
plt.title('Error Rate vs. K Value')
plt.xlabel('K')
plt.ylabel('Error Rate')
# K = list index + 1 (the sweep starts at K=1); the original printed the
# bare index, reporting a K one smaller than the actual best.
print("Minimum error:-",min(error_rate),"at K =",error_rate.index(min(error_rate))+1)
acc = []
# Will take some time
from sklearn import metrics
# Test-set accuracy for K = 1..39.
for i in range(1,40):
    neigh = KNeighborsClassifier(n_neighbors = i).fit(X_train,y_train)
    yhat = neigh.predict(X_test)
    acc.append(metrics.accuracy_score(y_test, yhat))
plt.figure(figsize=(10,6))
plt.plot(range(1,40),acc,color = 'blue',linestyle='dashed',
         marker='o',markerfacecolor='red', markersize=10)
plt.title('accuracy vs. K Value')
plt.xlabel('K')
plt.ylabel('Accuracy')
# Same index->K off-by-one fix as above.
print("Maximum accuracy:-",max(acc),"at K =",acc.index(max(acc))+1)
from sklearn.neighbors import KNeighborsClassifier
# Final KNN model for the binarized "aoa" target (cutoff 0.6).
refvar="aoa"
taglio=0.6
X=df_class_ref.drop(refvar,axis=1).copy()
y=df_class_ref[refvar].copy()
y_up_index = y >= taglio
y[y_up_index]=1
y_zero_index = y < taglio
y[y_zero_index]=0
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
clf_knn = KNeighborsClassifier(n_neighbors=24)
# Fit on the TRAINING split only. The original fit on the full (X, y),
# which leaks the test samples into the model (every test point is its
# own neighbor) and inflates the test-set metrics below.
clf_knn.fit(X_train, y_train)
# apply KNN to train set
y_pred = clf_knn.predict(X_train)
y_pred[:5]
y_train.values[:5]
print('Accuracy', accuracy_score(y_train, y_pred))
print('F1', f1_score(y_train, y_pred, average='weighted'))
print( classification_report(y_train, y_pred) )
# Confusion matrix for trainset
# TP, FN, FP, TN
confusion_matrix(y_train, y_pred)
# apply KNN to test set
y_pred = clf_knn.predict(X_test)
print('Accuracy %s' % accuracy_score(y_test, y_pred))
print('F1-score %s' % f1_score(y_test, y_pred, average='weighted'))
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
plot_confusion_matrix(clf_knn,
                      X_test,
                      y_test,
                      display_labels=['younger','older'])
# ROC / AUC from positive-class probabilities.
y_score = clf_knn.predict_proba(X_test)
fpr, tpr, th = roc_curve(y_test, y_score[:,1])
roc_auc = auc(fpr, tpr)
print(roc_auc)
plt.figure(figsize=(8,5))
plt.plot(fpr, tpr)
plt.plot([0,1], [0,1], 'k--')
plt.xlabel('False Positive Rate', fontsize=20)
plt.ylabel('True Positive Rate', fontsize=20)
plt.tick_params(axis='both', which='major', labelsize=22)
plt.show()
from sklearn.neighbors import KNeighborsClassifier
# K selection for KNN on the binarized "valence" target (cutoff 0.67).
refvar="valence"
taglio=0.67
X=df_class_ref.drop(refvar,axis=1).copy()
y=df_class_ref[refvar].copy()
y_up_index = y >= taglio
y[y_up_index]=1
y_zero_index = y < taglio
y[y_zero_index]=0
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
k = 4
neigh = KNeighborsClassifier(n_neighbors = k).fit(X_train,y_train)
Pred_y = neigh.predict(X_test)
# Test-set error rate for K = 1..99.
error_rate = []
for i in range(1,100):
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(X_train,y_train)
    pred_i = knn.predict(X_test)
    error_rate.append(np.mean(pred_i != y_test))
plt.figure(figsize=(10,6))
plt.plot(range(1,100),error_rate,color='blue', linestyle='dashed',
         marker='o',markerfacecolor='red', markersize=10)
plt.title('Error Rate vs. K Value')
plt.xlabel('K')
plt.ylabel('Error Rate')
# K = list index + 1 (the sweep starts at K=1); fixes the off-by-one
# in the reported best K.
print("Minimum error:-",min(error_rate),"at K =",error_rate.index(min(error_rate))+1)
acc = []
# Will take some time
from sklearn import metrics
# Test-set accuracy for K = 1..39.
for i in range(1,40):
    neigh = KNeighborsClassifier(n_neighbors = i).fit(X_train,y_train)
    yhat = neigh.predict(X_test)
    acc.append(metrics.accuracy_score(y_test, yhat))
plt.figure(figsize=(10,6))
plt.plot(range(1,40),acc,color = 'blue',linestyle='dashed',
         marker='o',markerfacecolor='red', markersize=10)
plt.title('accuracy vs. K Value')
plt.xlabel('K')
plt.ylabel('Accuracy')
# Same index->K off-by-one fix as above.
print("Maximum accuracy:-",max(acc),"at K =",acc.index(max(acc))+1)
# Final KNN model for the binarized "valence" target (cutoff 0.67).
refvar="valence"
taglio=0.67
X=df_class_ref.drop(refvar,axis=1).copy()
y=df_class_ref[refvar].copy()
y_up_index = y >= taglio
y[y_up_index]=1
y_zero_index = y < taglio
y[y_zero_index]=0
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
clf_knn = KNeighborsClassifier(n_neighbors=12)
# Fit on the TRAINING split only. The original fit on the full (X, y),
# leaking the test samples into the model and inflating the test metrics.
clf_knn.fit(X_train, y_train)
# Training-set evaluation.
y_pred = clf_knn.predict(X_train)
print('Accuracy', accuracy_score(y_train, y_pred))
print('F1', f1_score(y_train, y_pred, average='weighted'))
print( classification_report(y_train, y_pred) )
confusion_matrix(y_train, y_pred)
# Held-out test-set evaluation.
y_pred = clf_knn.predict(X_test)
print('Accuracy %s' % accuracy_score(y_test, y_pred))
print('F1-score %s' % f1_score(y_test, y_pred, average='weighted'))
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
plot_confusion_matrix(clf_knn,
                      X_test,
                      y_test,
                      display_labels=['not valuable','valuable'])
# ROC / AUC from positive-class probabilities.
y_score = clf_knn.predict_proba(X_test)
fpr, tpr, th = roc_curve(y_test, y_score[:,1])
roc_auc = auc(fpr, tpr)
print(roc_auc)
plt.figure(figsize=(8,5))
plt.plot(fpr, tpr)
plt.plot([0,1], [0,1], 'k--')
plt.xlabel('False Positive Rate', fontsize=20)
plt.ylabel('True Positive Rate', fontsize=20)
plt.tick_params(axis='both', which='major', labelsize=22)
plt.show()
from sklearn.neighbors import KNeighborsClassifier
# K selection for KNN on the binarized "polysemy" target (cutoff 0.6).
refvar="polysemy"
taglio=0.6
X=df_class_ref.drop(refvar,axis=1).copy()
y=df_class_ref[refvar].copy()
y_up_index = y >= taglio
y[y_up_index]=1
y_zero_index = y < taglio
y[y_zero_index]=0
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
k = 4
neigh = KNeighborsClassifier(n_neighbors = k).fit(X_train,y_train)
Pred_y = neigh.predict(X_test)
# Test-set error rate for K = 1..99.
error_rate = []
for i in range(1,100):
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(X_train,y_train)
    pred_i = knn.predict(X_test)
    error_rate.append(np.mean(pred_i != y_test))
plt.figure(figsize=(10,6))
plt.plot(range(1,100),error_rate,color='blue', linestyle='dashed',
         marker='o',markerfacecolor='red', markersize=10)
plt.title('Error Rate vs. K Value')
plt.xlabel('K')
plt.ylabel('Error Rate')
# K = list index + 1 (sweep starts at K=1); fixes the off-by-one report.
print("Minimum error:-",min(error_rate),"at K =",error_rate.index(min(error_rate))+1)
acc = []
# Will take some time
from sklearn import metrics
# The metric computed here is average precision, not F1: the plot and
# printout labels were corrected to match the computation.
for i in range(1,40):
    neigh = KNeighborsClassifier(n_neighbors = i).fit(X_train,y_train)
    yhat = neigh.predict(X_test)
    acc.append(metrics.average_precision_score(y_test, yhat))
plt.figure(figsize=(10,6))
plt.plot(range(1,40),acc,color = 'blue',linestyle='dashed',
         marker='o',markerfacecolor='red', markersize=10)
plt.title('Average precision vs. K Value')
plt.xlabel('K')
plt.ylabel('Average Precision')
print("Maximum average precision:-",max(acc),"at K =",acc.index(max(acc))+1)
# Final KNN model for the binarized "polysemy" target (cutoff 0.67).
refvar="polysemy"
taglio=0.67
X=df_class_ref.drop(refvar,axis=1).copy()
y=df_class_ref[refvar].copy()
y_up_index = y >= taglio
y[y_up_index]=1
y_zero_index = y < taglio
y[y_zero_index]=0
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
clf_knn = KNeighborsClassifier(n_neighbors=6)
# Fit on the TRAINING split only. The original fit on the full (X, y),
# leaking the test samples into the model and inflating the test metrics.
clf_knn.fit(X_train, y_train)
# Training-set evaluation.
y_pred = clf_knn.predict(X_train)
print('Accuracy', accuracy_score(y_train, y_pred))
print('F1', f1_score(y_train, y_pred, average='weighted'))
print( classification_report(y_train, y_pred) )
confusion_matrix(y_train, y_pred)
# Held-out test-set evaluation.
y_pred = clf_knn.predict(X_test)
print('Accuracy %s' % accuracy_score(y_test, y_pred))
print('F1-score %s' % f1_score(y_test, y_pred, average='weighted'))
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
plot_confusion_matrix(clf_knn,
                      X_test,
                      y_test,
                      display_labels=['not polysemic','polysemic'])
# ROC / AUC from positive-class probabilities.
y_score = clf_knn.predict_proba(X_test)
fpr, tpr, th = roc_curve(y_test, y_score[:,1])
roc_auc = auc(fpr, tpr)
print(roc_auc)
plt.figure(figsize=(8,5))
plt.plot(fpr, tpr)
plt.plot([0,1], [0,1], 'k--')
plt.xlabel('False Positive Rate', fontsize=20)
plt.ylabel('True Positive Rate', fontsize=20)
plt.tick_params(axis='both', which='major', labelsize=22)
plt.show()
from sklearn.neighbors import KNeighborsClassifier
# KNN on the multiclass "aoa" target (no binarization).
refvar="aoa"
X=df_class_ref.drop(refvar,axis=1).copy()
y=df_class_ref[refvar].copy()
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
k = 4
neigh = KNeighborsClassifier(n_neighbors = k).fit(X_train,y_train)
Pred_y = neigh.predict(X_test)
# Test-set error rate for K = 1..99.
error_rate = []
for i in range(1,100):
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(X_train,y_train)
    pred_i = knn.predict(X_test)
    error_rate.append(np.mean(pred_i != y_test))
plt.figure(figsize=(10,6))
plt.plot(range(1,100),error_rate,color='blue', linestyle='dashed',
         marker='o',markerfacecolor='red', markersize=10)
plt.title('Error Rate vs. K Value')
plt.xlabel('K')
plt.ylabel('Error Rate')
# K = list index + 1 (sweep starts at K=1); fixes the off-by-one report.
print("Minimum error:-",min(error_rate),"at K =",error_rate.index(min(error_rate))+1)
clf_knn = KNeighborsClassifier(n_neighbors=37)
# Fit on the TRAINING split only. The original fit on the full (X, y),
# leaking the test samples into the model and inflating the test metrics.
clf_knn.fit(X_train, y_train)
# Training-set evaluation.
y_pred = clf_knn.predict(X_train)
print('Accuracy', accuracy_score(y_train, y_pred))
print('F1', f1_score(y_train, y_pred, average='weighted'))
print( classification_report(y_train, y_pred) )
confusion_matrix(y_train, y_pred)
# Held-out test-set evaluation.
y_pred = clf_knn.predict(X_test)
print('Accuracy %s' % accuracy_score(y_test, y_pred))
print('F1-score %s' % f1_score(y_test, y_pred, average='weighted'))
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
plot_confusion_matrix(clf_knn,
                      X_test,
                      y_test,
                      display_labels=['0-2','2-4','4-6','6-8','8-10'])
y_score = clf_knn.predict_proba(X_test)
# Export the per-class report as a LaTeX table.
report = classification_report(y_test, y_pred, output_dict=True)
export = pd.DataFrame(report).transpose()
print(export.to_latex())
from sklearn.ensemble import RandomForestClassifier
# Random forest on the binarized "valence" target (cutoff 0.67).
refvar='valence'
taglio=0.67
X = df_class_ref.drop(refvar, axis=1).copy()
y = df_class_ref[refvar].copy()
# Threshold the target: scores at or above the cutoff -> 1, else 0.
y_up_index = (y >= taglio)
y[y_up_index] = 1
y_zero_index = (y < taglio)
y[y_zero_index] = 0
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
# Forest of 380 trees with a fixed seed for reproducibility.
model = RandomForestClassifier(n_estimators=380, random_state=42)
ra = model.fit(X_train, y_train)
# Held-out accuracy of the fitted forest.
prediction_test = model.predict(X_test)
print ("Accuracy = ", metrics.accuracy_score(y_test, prediction_test))
# Feature importances, largest first.
feature_list = list(X.columns)
feature_imp = pd.Series(model.feature_importances_, index=feature_list).sort_values(ascending=False)
print(feature_imp)
y_pred = model.predict(X_train)  # immediately replaced by the test-set predictions
y_pred = model.predict(X_test)
plot_confusion_matrix(ra, X_test, y_test, display_labels=['not val','val'])
# ROC curve / AUC from the positive-class probabilities.
y_score = model.predict_proba(X_test)
fpr, tpr, th = roc_curve(y_test, y_score[:, 1])
roc_auc = auc(fpr, tpr)
print(roc_auc)
plt.figure(figsize=(8, 5))
plt.plot(fpr, tpr)
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate', fontsize=20)
plt.ylabel('True Positive Rate', fontsize=20)
plt.tick_params(axis='both', which='major', labelsize=22)
plt.show()
# Test-set summary metrics.
print(f'Accuracy {accuracy_score(y_test, y_pred)}')
print(f'F1-score {f1_score(y_test, y_pred, average="weighted")}')
print(classification_report(y_test, y_pred))
# --- Random forest: binary classification of 'polysemy' ---
# Binarize the target: values >= `taglio` become class 1, the rest class 0.
refvar = 'polysemy'
taglio = 0.67
X = df_class_ref.drop(refvar, axis=1).copy()
y = df_class_ref[refvar].copy()
y_up_index = y >= taglio
y[y_up_index] = 1
y_zero_index = y < taglio
y[y_zero_index] = 0
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Fit a forest of 385 trees (the original comment wrongly said 10).
model = RandomForestClassifier(n_estimators=385, random_state=42)
ra = model.fit(X_train, y_train)

# Accuracy on the held-out test split.
prediction_test = model.predict(X_test)
print("Accuracy = ", metrics.accuracy_score(y_test, prediction_test))

# Feature importances, most informative first.
feature_list = list(X.columns)
feature_imp = pd.Series(model.feature_importances_, index=feature_list).sort_values(ascending=False)
print(feature_imp)

# Test-set predictions. (Removed a dead predict-on-train whose result was
# immediately overwritten.)
y_pred = model.predict(X_test)

plot_confusion_matrix(ra,
                      X_test,
                      y_test,
                      display_labels=['not pol', 'pol']
                      )

# ROC curve / AUC from the predicted probability of class 1.
y_score = model.predict_proba(X_test)
fpr, tpr, th = roc_curve(y_test, y_score[:, 1])
roc_auc = auc(fpr, tpr)
print(roc_auc)
plt.figure(figsize=(8, 5))
plt.plot(fpr, tpr)
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate', fontsize=20)
plt.ylabel('True Positive Rate', fontsize=20)
plt.tick_params(axis='both', which='major', labelsize=22)
plt.show()
print('Accuracy %s' % accuracy_score(y_test, y_pred))
print('F1-score %s' % f1_score(y_test, y_pred, average='weighted'))
print(classification_report(y_test, y_pred))
# --- Random forest: binary classification of 'aoa' (age of acquisition) ---
# Binarize the target: values >= `taglio` become class 1, the rest class 0.
refvar = 'aoa'
taglio = 0.6
X = df_class_ref.drop(refvar, axis=1).copy()
y = df_class_ref[refvar].copy()
y_up_index = y >= taglio
y[y_up_index] = 1
y_zero_index = y < taglio
y[y_zero_index] = 0
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Fit a forest of 380 trees (the original comment wrongly said 10).
model = RandomForestClassifier(n_estimators=380, random_state=42)
ra = model.fit(X_train, y_train)

# Accuracy on the held-out test split.
prediction_test = model.predict(X_test)
print("Accuracy = ", metrics.accuracy_score(y_test, prediction_test))

# Feature importances, most informative first.
feature_list = list(X.columns)
feature_imp = pd.Series(model.feature_importances_, index=feature_list).sort_values(ascending=False)
print(feature_imp)

# Test-set predictions. (Removed a dead predict-on-train whose result was
# immediately overwritten.)
y_pred = model.predict(X_test)

plot_confusion_matrix(ra,
                      X_test,
                      y_test,
                      display_labels=['younger', 'older']
                      )

# ROC curve / AUC from the predicted probability of class 1.
y_score = model.predict_proba(X_test)
fpr, tpr, th = roc_curve(y_test, y_score[:, 1])
roc_auc = auc(fpr, tpr)
print(roc_auc)
plt.figure(figsize=(8, 5))
plt.plot(fpr, tpr)
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate', fontsize=20)
plt.ylabel('True Positive Rate', fontsize=20)
plt.tick_params(axis='both', which='major', labelsize=22)
plt.show()
print('Accuracy %s' % accuracy_score(y_test, y_pred))
print('F1-score %s' % f1_score(y_test, y_pred, average='weighted'))
print(classification_report(y_test, y_pred))
import matplotlib.pyplot as plt
from collections import OrderedDict
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

RANDOM_STATE = 42

# Binarize 'valence' as before: >= taglio -> 1, else 0.
refvar = 'valence'
taglio = 0.67
X = df_class_ref.drop(refvar, axis=1).copy()
y = df_class_ref[refvar].copy()
y_up_index = y >= taglio
y[y_up_index] = 1
y_zero_index = y < taglio
y[y_zero_index] = 0

# NOTE: Setting the `warm_start` construction parameter to `True` disables
# support for parallelized ensembles but is necessary for tracking the OOB
# error trajectory during training.
ensemble_clfs = [
    (
        "RandomForestClassifier, max_features='sqrt'",
        RandomForestClassifier(
            warm_start=True,
            oob_score=True,
            max_features="sqrt",
            random_state=RANDOM_STATE,
        ),
    ),
    (
        "RandomForestClassifier, max_features='log2'",
        RandomForestClassifier(
            warm_start=True,
            max_features="log2",
            oob_score=True,
            random_state=RANDOM_STATE,
        ),
    ),
    (
        "RandomForestClassifier, max_features=None",
        RandomForestClassifier(
            warm_start=True,
            max_features=None,
            oob_score=True,
            random_state=RANDOM_STATE,
        ),
    ),
]

# Map a classifier name to a list of (<n_estimators>, <error rate>) pairs.
error_rate = OrderedDict((label, []) for label, _ in ensemble_clfs)

# Range of `n_estimators` values to explore.
min_estimators = 100
max_estimators = 1000

# Grow each warm-started forest in steps of 5 trees, recording the OOB
# error after each step. (Loop indentation restored; lost in the export.)
for label, clf in ensemble_clfs:
    for i in range(min_estimators, max_estimators + 1, 5):
        clf.set_params(n_estimators=i)
        clf.fit(X, y)
        oob_error = 1 - clf.oob_score_
        error_rate[label].append((i, oob_error))

# Generate the "OOB error rate" vs. "n_estimators" plot.
for label, clf_err in error_rate.items():
    xs, ys = zip(*clf_err)
    plt.plot(xs, ys, label=label)

plt.xlim(min_estimators, max_estimators)
plt.xlabel("n_estimators")
plt.ylabel("OOB error rate")
plt.legend(loc="upper right")
plt.show()
# --- Decision tree for binarized 'aoa' with cost-complexity pruning ---
refvar = "aoa"
taglio = 0.6
X = df_class_ref.drop(refvar, axis=1).copy()
y = df_class_ref[refvar].copy()
y_up_index = y >= taglio
y[y_up_index] = 1
y_zero_index = y < taglio
y[y_zero_index] = 0
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Fully grown tree, used only to derive the candidate pruning alphas.
clf_dt = DecisionTreeClassifier(criterion='entropy', max_depth=None, min_samples_split=2, min_samples_leaf=1, random_state=42)
clf_dt = clf_dt.fit(X_train, y_train)
path = clf_dt.cost_complexity_pruning_path(X_train, y_train)
ccp_alphas = path.ccp_alphas
ccp_alphas = ccp_alphas[:-1]  # drop the largest alpha (prunes to the root)

# One tree per candidate alpha. (Loop indentation restored; lost in export.)
clf_dts = []
for ccp_alpha in ccp_alphas:
    clf_dt = DecisionTreeClassifier(criterion='entropy', random_state=0, ccp_alpha=ccp_alpha)
    clf_dt.fit(X_train, y_train)
    clf_dts.append(clf_dt)

train_scores = [clf_dt.score(X_train, y_train) for clf_dt in clf_dts]
test_scores = [clf_dt.score(X_test, y_test) for clf_dt in clf_dts]

fig, ax = plt.subplots()
ax.set_xlabel("alpha")
ax.set_ylabel("accuracy")
ax.set_title("Accuracy vs alpha for training and testing sets")
ax.plot(ccp_alphas, train_scores, marker='o', label='train', drawstyle='steps-post')
ax.plot(ccp_alphas, test_scores, marker='o', label='test', drawstyle='steps-post')
ax.legend()
plt.show()

# 10-fold cross-validated accuracy for each alpha.
alpha_loop_values = []
for ccp_alpha in ccp_alphas:
    clf_dt = DecisionTreeClassifier(criterion='entropy', max_depth=None, min_samples_split=2, min_samples_leaf=1, random_state=0, ccp_alpha=ccp_alpha)
    scores = cross_val_score(clf_dt, X_train, y_train, cv=10)
    alpha_loop_values.append([ccp_alpha, np.mean(scores), np.std(scores)])

alpha_results = pd.DataFrame(alpha_loop_values,
                             columns=['alpha', 'mean_accuracy', 'std'])
alpha_results.plot(x='alpha',
                   y='mean_accuracy',
                   yerr='std',
                   marker='o',
                   linestyle='-')
# Zoom in on the promising alpha range seen in the plot.
alpha_results[(alpha_results['alpha'] > 0.002)
              &
              (alpha_results['alpha'] < 0.004)]

# Alpha read off the cross-validation plot above.
# (Removed a redundant float() conversion of an already-float literal.)
ideal_ccp_alpha = 0.002246

# Final pruned tree, confusion matrix and tree plot.
clf_dt_pruned = DecisionTreeClassifier(criterion='entropy', max_depth=None, min_samples_split=2, min_samples_leaf=1, random_state=42, ccp_alpha=ideal_ccp_alpha)
clf_dt_pruned = clf_dt_pruned.fit(X_train, y_train)
plot_confusion_matrix(clf_dt_pruned,
                      X_test,
                      y_test,
                      display_labels=['young', 'old'])
plt.figure(figsize=(15, 7.5))
from sklearn.tree import plot_tree
plot_tree(clf_dt_pruned,
          filled=True,
          rounded=True,
          class_names=["young", "old"],
          feature_names=X.columns)

# Test-set evaluation. (Removed a dead predict-on-train whose result was
# immediately overwritten.)
y_pred = clf_dt_pruned.predict(X_test)
print('Accuracy %s' % accuracy_score(y_test, y_pred))
print('F1-score %s' % f1_score(y_test, y_pred, average=None))
print(classification_report(y_test, y_pred))

# ROC curve / AUC.
y_score = clf_dt_pruned.predict_proba(X_test)
fpr, tpr, th = roc_curve(y_test, y_score[:, 1])
roc_auc = auc(fpr, tpr)
print(roc_auc)
plt.figure(figsize=(8, 5))
plt.plot(fpr, tpr)
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate', fontsize=20)
plt.ylabel('True Positive Rate', fontsize=20)
plt.tick_params(axis='both', which='major', labelsize=22)
plt.show()
# --- Decision tree for binarized 'polysemy' with cost-complexity pruning ---
refvar = "polysemy"
taglio = 0.6
X = df_class_ref.drop(refvar, axis=1).copy()
y = df_class_ref[refvar].copy()
y_up_index = y >= taglio
y[y_up_index] = 1
y_zero_index = y < taglio
y[y_zero_index] = 0
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Fully grown tree, used only to derive the candidate pruning alphas.
clf_dt = DecisionTreeClassifier(criterion='entropy', max_depth=None, min_samples_split=2, min_samples_leaf=1, random_state=42)
clf_dt = clf_dt.fit(X_train, y_train)
path = clf_dt.cost_complexity_pruning_path(X_train, y_train)
ccp_alphas = path.ccp_alphas
ccp_alphas = ccp_alphas[:-1]  # drop the largest alpha (prunes to the root)

# One tree per candidate alpha. (Loop indentation restored; lost in export.)
clf_dts = []
for ccp_alpha in ccp_alphas:
    clf_dt = DecisionTreeClassifier(criterion='entropy', random_state=0, ccp_alpha=ccp_alpha)
    clf_dt.fit(X_train, y_train)
    clf_dts.append(clf_dt)

train_scores = [clf_dt.score(X_train, y_train) for clf_dt in clf_dts]
test_scores = [clf_dt.score(X_test, y_test) for clf_dt in clf_dts]

fig, ax = plt.subplots()
ax.set_xlabel("alpha")
ax.set_ylabel("accuracy")
ax.set_title("Accuracy vs alpha for training and testing sets")
ax.plot(ccp_alphas, train_scores, marker='o', label='train', drawstyle='steps-post')
ax.plot(ccp_alphas, test_scores, marker='o', label='test', drawstyle='steps-post')
ax.legend()
plt.show()

# 10-fold cross-validated accuracy for each alpha.
alpha_loop_values = []
for ccp_alpha in ccp_alphas:
    clf_dt = DecisionTreeClassifier(criterion='entropy', max_depth=None, min_samples_split=2, min_samples_leaf=1, random_state=0, ccp_alpha=ccp_alpha)
    scores = cross_val_score(clf_dt, X_train, y_train, cv=10)
    alpha_loop_values.append([ccp_alpha, np.mean(scores), np.std(scores)])

alpha_results = pd.DataFrame(alpha_loop_values,
                             columns=['alpha', 'mean_accuracy', 'std'])
alpha_results.plot(x='alpha',
                   y='mean_accuracy',
                   yerr='std',
                   marker='o',
                   linestyle='-')
# Inspect the candidate alphas above the elbow.
alpha_results[(alpha_results['alpha'] > 0.002)
              ]

# Alpha read off the cross-validation plot above.
# (Removed a redundant float() conversion of an already-float literal.)
ideal_ccp_alpha = 0.003002

# Final pruned tree, confusion matrix and tree plot.
clf_dt_pruned = DecisionTreeClassifier(criterion='entropy', max_depth=None, min_samples_split=2, min_samples_leaf=1, random_state=42, ccp_alpha=ideal_ccp_alpha)
clf_dt_pruned = clf_dt_pruned.fit(X_train, y_train)
plot_confusion_matrix(clf_dt_pruned,
                      X_test,
                      y_test,
                      display_labels=['not polysemic', 'polysemic'])
plt.figure(figsize=(15, 7.5))
from sklearn.tree import plot_tree
plot_tree(clf_dt_pruned,
          filled=True,
          rounded=True,
          class_names=["not polysemic", "polysemic"],
          feature_names=X.columns)

# Test-set evaluation. (Removed a dead predict-on-train whose result was
# immediately overwritten.)
y_pred = clf_dt_pruned.predict(X_test)
print('Accuracy %s' % accuracy_score(y_test, y_pred))
print('F1-score %s' % f1_score(y_test, y_pred, average=None))
print(classification_report(y_test, y_pred))

# ROC curve / AUC.
y_score = clf_dt_pruned.predict_proba(X_test)
fpr, tpr, th = roc_curve(y_test, y_score[:, 1])
roc_auc = auc(fpr, tpr)
print(roc_auc)
plt.figure(figsize=(8, 5))
plt.plot(fpr, tpr)
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate', fontsize=20)
plt.ylabel('True Positive Rate', fontsize=20)
plt.tick_params(axis='both', which='major', labelsize=22)
plt.show()
# --- Decision tree for binarized 'valence' with cost-complexity pruning ---
refvar = "valence"
taglio = 0.67
X = df_class_ref.drop(refvar, axis=1).copy()
y = df_class_ref[refvar].copy()
y_up_index = y >= taglio
y[y_up_index] = 1
y_zero_index = y < taglio
y[y_zero_index] = 0
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Fully grown tree, used only to derive the candidate pruning alphas.
clf_dt = DecisionTreeClassifier(criterion='entropy', max_depth=None, min_samples_split=2, min_samples_leaf=1, random_state=42)
clf_dt = clf_dt.fit(X_train, y_train)
path = clf_dt.cost_complexity_pruning_path(X_train, y_train)
ccp_alphas = path.ccp_alphas
ccp_alphas = ccp_alphas[:-1]  # drop the largest alpha (prunes to the root)

# One tree per candidate alpha. (Loop indentation restored; lost in export.)
# NOTE(review): unlike the aoa/polysemy cells, this loop uses the default
# 'gini' criterion while the CV loop below uses 'entropy' -- possibly
# unintentional; kept as-is to preserve the original results.
clf_dts = []
for ccp_alpha in ccp_alphas:
    clf_dt = DecisionTreeClassifier(random_state=0, ccp_alpha=ccp_alpha)
    clf_dt.fit(X_train, y_train)
    clf_dts.append(clf_dt)

train_scores = [clf_dt.score(X_train, y_train) for clf_dt in clf_dts]
test_scores = [clf_dt.score(X_test, y_test) for clf_dt in clf_dts]

fig, ax = plt.subplots()
ax.set_xlabel("alpha")
ax.set_ylabel("accuracy")
ax.set_title("Accuracy vs alpha for training and testing sets")
ax.plot(ccp_alphas, train_scores, marker='o', label='train', drawstyle='steps-post')
ax.plot(ccp_alphas, test_scores, marker='o', label='test', drawstyle='steps-post')
ax.legend()
plt.show()

# 10-fold cross-validated accuracy for each alpha.
alpha_loop_values = []
for ccp_alpha in ccp_alphas:
    clf_dt = DecisionTreeClassifier(criterion='entropy', max_depth=None, min_samples_split=2, min_samples_leaf=1, random_state=0, ccp_alpha=ccp_alpha)
    scores = cross_val_score(clf_dt, X_train, y_train, cv=10)
    alpha_loop_values.append([ccp_alpha, np.mean(scores), np.std(scores)])

alpha_results = pd.DataFrame(alpha_loop_values,
                             columns=['alpha', 'mean_accuracy', 'std'])
alpha_results.plot(x='alpha',
                   y='mean_accuracy',
                   yerr='std',
                   marker='o',
                   linestyle='-')
# Zoom in on the promising alpha range seen in the plot.
alpha_results[(alpha_results['alpha'] > 0.0025)
              &
              (alpha_results['alpha'] < 0.0035)]

# Alpha read off the cross-validation plot above.
# (Removed a redundant float() conversion of an already-float literal.)
ideal_ccp_alpha = 0.003041

# Final pruned tree, confusion matrix and tree plot.
clf_dt_pruned = DecisionTreeClassifier(criterion='entropy', max_depth=None, min_samples_split=2, min_samples_leaf=1, random_state=42, ccp_alpha=ideal_ccp_alpha)
clf_dt_pruned = clf_dt_pruned.fit(X_train, y_train)
plot_confusion_matrix(clf_dt_pruned,
                      X_test,
                      y_test,
                      display_labels=['not valuable', 'valuable'])
plt.figure(figsize=(15, 7.5))
from sklearn.tree import plot_tree
plot_tree(clf_dt_pruned,
          filled=True,
          rounded=True,
          class_names=["not valuable", "valuable"],
          feature_names=X.columns)

# Test-set evaluation. (Removed a dead predict-on-train whose result was
# immediately overwritten.)
y_pred = clf_dt_pruned.predict(X_test)
print('Accuracy %s' % accuracy_score(y_test, y_pred))
print('F1-score %s' % f1_score(y_test, y_pred, average=None))
print(classification_report(y_test, y_pred))

# ROC curve / AUC.
y_score = clf_dt_pruned.predict_proba(X_test)
fpr, tpr, th = roc_curve(y_test, y_score[:, 1])
roc_auc = auc(fpr, tpr)
print(roc_auc)
plt.figure(figsize=(8, 5))
plt.plot(fpr, tpr)
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate', fontsize=20)
plt.ylabel('True Positive Rate', fontsize=20)
plt.tick_params(axis='both', which='major', labelsize=22)
plt.show()
###### Lecture Pipeline ######
# you can/have to modify the order for your project,
# e.g. hyperparameter tuning before evaluating a decision tree
# Hold-out
from sklearn.model_selection import train_test_split
# Decision Tree
from sklearn.tree import DecisionTreeClassifier
# Optional: KNN
from sklearn.neighbors import KNeighborsClassifier
##### Decision Tree Visualization ###
from sklearn import tree # export_graphviz() method
import pydotplus # viz stuff
from IPython.display import Image # viz stuff
####################################
######## Evaluation #######
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.metrics import roc_curve, auc, roc_auc_score
# Optional: baseline comparison
from sklearn.dummy import DummyClassifier
###########################
# hyperparameter tuning
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
# cross-validation
from sklearn.model_selection import cross_val_score
# Binarize 'valence', fit a gini tree with light leaf constraints, inspect
# its feature importances, draw the top of the tree, and evaluate it.
refvar = "valence"
taglio = 0.67
X = df_class_ref.drop(refvar, axis=1).copy()
y = df_class_ref[refvar].copy()
high_mask = y >= taglio
y[high_mask] = 1
low_mask = y < taglio
y[low_mask] = 0
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

clf = DecisionTreeClassifier(criterion='gini',
                             max_depth=None,
                             min_samples_split=10,
                             min_samples_leaf=5,
                             )
clf.fit(X_train, y_train)

# Pair each attribute with its importance, then display them sorted
# from most to least important.
imp_dict = dict(zip(attributes, clf.feature_importances_))
imp_dict
dict(sorted(imp_dict.items(), key=lambda item: item[1], reverse=True))

# Render the first two levels of the tree with graphviz.
dot_data = tree.export_graphviz(clf, out_file=None,
                                feature_names=attributes,
                                class_names=[str(v) for v in clf.classes_],
                                filled=True, rounded=True,
                                special_characters=True,
                                max_depth=2)
graph = pydotplus.graph_from_dot_data(dot_data)
Image(graph.create_png())

# Metrics on the training split.
y_pred = clf.predict(X_train)
print('Accuracy', accuracy_score(y_train, y_pred))
print('F1', f1_score(y_train, y_pred, average=None))
print(classification_report(y_train, y_pred))
# NOT SUGGESTED FOR THE PROJECT
cf = confusion_matrix(y_train, y_pred)
cf

# Test-split predictions, probability scores, and ROC curve.
y_pred = clf.predict(X_test)
y_score = clf.predict_proba(X_test)
y_score[:6]
fpr, tpr, th = roc_curve(y_test, y_score[:, 1])
roc_auc = auc(fpr, tpr)
print(roc_auc)
plt.figure(figsize=(8, 5))
plt.plot(fpr, tpr)
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate', fontsize=20)
plt.ylabel('True Positive Rate', fontsize=20)
plt.tick_params(axis='both', which='major', labelsize=22)
plt.show()
# Baseline that always predicts class 1.
# BUG FIX: fit on the TRAINING split -- the original fit on the test set,
# which is bad practice even though a constant dummy ignores the features.
dummy_clf = DummyClassifier(strategy='constant', constant=1)
dummy_clf.fit(X_train, y_train)
y_pred = dummy_clf.predict(X_test)
# Exhaustive hyperparameter search for the decision tree, scored by F1.
param_list = {
    'max_depth': [None, 2, 3, 4],
    'min_samples_split': [2, 5, 10, 20],
    'min_samples_leaf': [1, 5, 10, 20],
}
grid_search = GridSearchCV(clf, param_grid=param_list, scoring='f1')
grid_search.fit(X, y)
res = grid_search.cv_results_
grid_search.best_estimator_
def report(results, n_top=3):
    """Print the top-ranked configurations from a sklearn search.

    Parameters
    ----------
    results : dict
        A ``cv_results_`` mapping from a fitted ``GridSearchCV`` /
        ``RandomizedSearchCV``; must contain 'rank_test_score',
        'mean_test_score', 'std_test_score' and 'params'.
    n_top : int
        Number of ranks to report; ties within a rank are all printed.
    """
    # (Function-body indentation restored; it was lost in the export.)
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                results['mean_test_score'][candidate],
                results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")
report(res, n_top=3)
# Final tree with the hyperparameters selected by the grid search.
# Added random_state=42 for reproducibility, consistent with every other
# tree constructor in this notebook.
clf_dt_pruned = DecisionTreeClassifier(criterion='gini', max_depth=None, min_samples_split=5, min_samples_leaf=20, random_state=42)
clf_dt_pruned = clf_dt_pruned.fit(X_train, y_train)

plot_confusion_matrix(clf_dt_pruned,
                      X_test,
                      y_test,
                      display_labels=['not valuable', 'valuable'])

plt.figure(figsize=(15, 7.5))
from sklearn.tree import plot_tree
plot_tree(clf_dt_pruned,
          filled=True,
          rounded=True,
          class_names=["not valuable", "valuable"],
          feature_names=X.columns)

# Test-set evaluation. (Removed a dead predict-on-train whose result was
# immediately overwritten.)
y_pred = clf_dt_pruned.predict(X_test)
print('Accuracy %s' % accuracy_score(y_test, y_pred))
print('F1-score %s' % f1_score(y_test, y_pred, average=None))
print(classification_report(y_test, y_pred))

# ROC curve / AUC.
y_score = clf_dt_pruned.predict_proba(X_test)
fpr, tpr, th = roc_curve(y_test, y_score[:, 1])
roc_auc = auc(fpr, tpr)
print(roc_auc)
plt.figure(figsize=(8, 5))
plt.plot(fpr, tpr)
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate', fontsize=20)
plt.ylabel('True Positive Rate', fontsize=20)
plt.tick_params(axis='both', which='major', labelsize=22)
plt.show()
dfprepro.head()
| length | arousal | valence | dominance | familiarity | aoa | semsize | masculinity | polysemy | web_corpus_freq | perceivability | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 8 | 4.200 | 2.864 | 4.333 | 2.382 | 6.760 | 4.652 | 5.391 | 0 | 5.0 | 4.9230 |
| 1 | 5 | 3.125 | 5.781 | 4.667 | 3.324 | 5.177 | 5.121 | 3.303 | 0 | 6.0 | 5.6250 |
| 2 | 10 | 3.273 | 5.250 | 5.235 | 5.121 | 5.543 | 2.667 | 3.971 | 0 | 5.0 | 3.2315 |
| 3 | 8 | 4.194 | 3.767 | 4.419 | 3.971 | 6.233 | 4.679 | 5.167 | 0 | 5.0 | 2.9415 |
| 4 | 10 | 3.846 | 3.880 | 4.800 | 3.097 | 6.407 | 5.083 | 4.571 | 0 | 5.0 | 2.9315 |
dfprepro.describe()
| length | arousal | valence | dominance | familiarity | aoa | semsize | masculinity | polysemy | web_corpus_freq | perceivability | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 4682.000000 | 4682.000000 | 4682.000000 | 4682.000000 | 4682.000000 | 4682.000000 | 4682.000000 | 4682.000000 | 4682.000000 | 4682.000000 | 4682.000000 |
| mean | 6.348355 | 4.678129 | 5.086797 | 5.044939 | 5.271335 | 4.143427 | 4.136403 | 4.099933 | 0.080948 | 6.285135 | 4.644645 |
| std | 2.006230 | 1.097163 | 1.594344 | 0.930669 | 0.921218 | 1.252770 | 1.023293 | 0.912293 | 0.272785 | 0.843987 | 1.366345 |
| min | 2.000000 | 2.057000 | 1.030000 | 1.941000 | 1.647000 | 1.219000 | 1.375000 | 1.000000 | 0.000000 | 4.000000 | 1.803500 |
| 25% | 5.000000 | 3.849000 | 4.115000 | 4.529000 | 4.706000 | 3.114000 | 3.438000 | 3.606000 | 0.000000 | 6.000000 | 3.407000 |
| 50% | 6.000000 | 4.571000 | 5.290000 | 5.123000 | 5.438000 | 4.177000 | 4.186500 | 4.121000 | 0.000000 | 6.000000 | 4.562500 |
| 75% | 8.000000 | 5.419000 | 6.088000 | 5.600000 | 5.969000 | 5.152000 | 4.882000 | 4.656000 | 0.000000 | 7.000000 | 5.970500 |
| max | 16.000000 | 8.177000 | 8.647000 | 8.371000 | 6.939000 | 6.971000 | 6.912000 | 6.971000 | 1.000000 | 9.000000 | 6.925000 |
# Pattern-mining prep: min-max scale the selected columns onto [0, 1] and
# stretch to [-0.5, 3.5] so that rounding (done later) yields a handful of
# discrete bins per attribute.
df_pm = dfprepro.copy()
var_to_scale = ['length', 'aoa', "arousal", "valence", "dominance", "familiarity", "semsize", "masculinity", "perceivability"]
scaled = MinMaxScaler().fit_transform(df_pm[var_to_scale].values)
df_pm[var_to_scale] = scaled * 4 - 0.5
df_pm.head()
| length | arousal | valence | dominance | familiarity | aoa | semsize | masculinity | polysemy | web_corpus_freq | perceivability | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1.214286 | 0.900654 | 0.463109 | 0.988025 | 0.055556 | 3.353268 | 1.867347 | 2.441551 | 0 | 5.0 | 1.936396 |
| 1 | 0.357143 | 0.198039 | 1.994946 | 1.195801 | 0.767574 | 2.252434 | 2.206159 | 1.042790 | 0 | 6.0 | 2.484672 |
| 2 | 1.785714 | 0.294771 | 1.716096 | 1.549145 | 2.125850 | 2.506954 | 0.433357 | 1.490286 | 0 | 5.0 | 0.615298 |
| 3 | 1.214286 | 0.896732 | 0.937311 | 1.041524 | 1.256614 | 2.986787 | 1.886852 | 2.291492 | 0 | 5.0 | 0.388802 |
| 4 | 1.785714 | 0.669281 | 0.996652 | 1.278538 | 0.595994 | 3.107789 | 2.178707 | 1.892229 | 0 | 5.0 | 0.380992 |
# Snap every scaled value to its nearest integer bin.
df_pm = df_pm.round(0)
df_pm.head()
| length | arousal | valence | dominance | familiarity | aoa | semsize | masculinity | polysemy | web_corpus_freq | perceivability | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1.0 | 1.0 | 0.0 | 1.0 | 0.0 | 3.0 | 2.0 | 2.0 | 0 | 5.0 | 2.0 |
| 1 | 0.0 | 0.0 | 2.0 | 1.0 | 1.0 | 2.0 | 2.0 | 1.0 | 0 | 6.0 | 2.0 |
| 2 | 2.0 | 0.0 | 2.0 | 2.0 | 2.0 | 3.0 | 0.0 | 1.0 | 0 | 5.0 | 1.0 |
| 3 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 3.0 | 2.0 | 2.0 | 0 | 5.0 | 0.0 |
| 4 | 2.0 | 1.0 | 1.0 | 1.0 | 1.0 | 3.0 | 2.0 | 2.0 | 0 | 5.0 | 0.0 |
df_pm.isnull().sum()
length 0 arousal 0 valence 0 dominance 0 familiarity 0 aoa 0 semsize 0 masculinity 0 polysemy 0 web_corpus_freq 14 perceivability 0 dtype: int64
df_pm_copy = df_pm.copy()

# Impute the missing web_corpus_freq values with the column mean
# (idiomatic fillna instead of a boolean-mask assignment with `== True`).
df_pm['web_corpus_freq'] = df_pm['web_corpus_freq'].fillna(df_pm['web_corpus_freq'].mean())

# Tag every binned value with its column name so each cell becomes a
# distinct "item" for the apriori transactions. NOTE: the '_Lenght' and
# '_Age_of_Aquisition' misspellings are preserved on purpose -- the
# itemset output downstream already uses these exact labels.
_suffixes = {
    'length': '_Lenght',
    'arousal': '_Arousal',
    'valence': '_Valence',
    'dominance': '_Dominance',
    'familiarity': '_Familiarity',
    'aoa': '_Age_of_Aquisition',
    'semsize': '_SemSize',
    'masculinity': '_Masculinity',
    'web_corpus_freq': '_Web_Corpus_Freq',
    'perceivability': '_Perceivability',
}
for _col, _suffix in _suffixes.items():
    df_pm[_col] = df_pm[_col].astype(str) + _suffix
df_pm.head()
df_pm.head()
| length | arousal | valence | dominance | familiarity | aoa | semsize | masculinity | polysemy | web_corpus_freq | perceivability | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1.0_Lenght | 1.0_Arousal | 0.0_Valence | 1.0_Dominance | 0.0_Familiarity | 3.0_Age_of_Aquisition | 2.0_SemSize | 2.0_Masculinity | 0 | 5.0_Web_Corpus_Freq | 2.0_Perceivability |
| 1 | 0.0_Lenght | 0.0_Arousal | 2.0_Valence | 1.0_Dominance | 1.0_Familiarity | 2.0_Age_of_Aquisition | 2.0_SemSize | 1.0_Masculinity | 0 | 6.0_Web_Corpus_Freq | 2.0_Perceivability |
| 2 | 2.0_Lenght | 0.0_Arousal | 2.0_Valence | 2.0_Dominance | 2.0_Familiarity | 3.0_Age_of_Aquisition | 0.0_SemSize | 1.0_Masculinity | 0 | 5.0_Web_Corpus_Freq | 1.0_Perceivability |
| 3 | 1.0_Lenght | 1.0_Arousal | 1.0_Valence | 1.0_Dominance | 1.0_Familiarity | 3.0_Age_of_Aquisition | 2.0_SemSize | 2.0_Masculinity | 0 | 5.0_Web_Corpus_Freq | 0.0_Perceivability |
| 4 | 2.0_Lenght | 1.0_Arousal | 1.0_Valence | 1.0_Dominance | 1.0_Familiarity | 3.0_Age_of_Aquisition | 2.0_SemSize | 2.0_Masculinity | 0 | 5.0_Web_Corpus_Freq | 0.0_Perceivability |
# Recode the 0/1 polysemy flag as readable category labels.
polysemy_dict = {0: 'Not Polysemy', 1: 'Polysemy'}
df_pm['polysemy'] = df_pm['polysemy'].map(polysemy_dict)
df_pm.head()
| length | arousal | valence | dominance | familiarity | aoa | semsize | masculinity | polysemy | web_corpus_freq | perceivability | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1.0_Lenght | 1.0_Arousal | 0.0_Valence | 1.0_Dominance | 0.0_Familiarity | 3.0_Age_of_Aquisition | 2.0_SemSize | 2.0_Masculinity | Not Polysemy | 5.0_Web_Corpus_Freq | 2.0_Perceivability |
| 1 | 0.0_Lenght | 0.0_Arousal | 2.0_Valence | 1.0_Dominance | 1.0_Familiarity | 2.0_Age_of_Aquisition | 2.0_SemSize | 1.0_Masculinity | Not Polysemy | 6.0_Web_Corpus_Freq | 2.0_Perceivability |
| 2 | 2.0_Lenght | 0.0_Arousal | 2.0_Valence | 2.0_Dominance | 2.0_Familiarity | 3.0_Age_of_Aquisition | 0.0_SemSize | 1.0_Masculinity | Not Polysemy | 5.0_Web_Corpus_Freq | 1.0_Perceivability |
| 3 | 1.0_Lenght | 1.0_Arousal | 1.0_Valence | 1.0_Dominance | 1.0_Familiarity | 3.0_Age_of_Aquisition | 2.0_SemSize | 2.0_Masculinity | Not Polysemy | 5.0_Web_Corpus_Freq | 0.0_Perceivability |
| 4 | 2.0_Lenght | 1.0_Arousal | 1.0_Valence | 1.0_Dominance | 1.0_Familiarity | 3.0_Age_of_Aquisition | 2.0_SemSize | 2.0_Masculinity | Not Polysemy | 5.0_Web_Corpus_Freq | 0.0_Perceivability |
# One transaction (list of item strings) per word, as apriori expects.
X = df_pm.to_numpy().tolist()
X[0]
['8_Lenght', '3.0_Arousal', '2.0_Valence', '4.0_Dominance', '2.0_Familiarity', '6.0_Age_of_Aquisition', '4.0_SemSize', '5.0_Masculinity', 'Not Polysemy', '5.0_Web_Corpus_Freq', '4.0_Perceivability']
help(apriori)
Help on built-in function apriori in module fim:
apriori(...)
apriori (tracts, target='s', supp=10, zmin=1, zmax=None, report='a',
eval='x', agg='x', thresh=10, prune=None, algo='b', mode='',
border=None)
Find frequent item sets with the Apriori algorithm.
tracts transaction database to mine (mandatory)
The database must be an iterable of transactions;
each transaction must be an iterable of items;
each item must be a hashable object.
If the database is a dictionary, the transactions are
the keys, the values their (integer) multiplicities.
target type of frequent item sets to find (default: s)
s/a sets/all all frequent item sets
c closed closed frequent item sets
m maximal maximal frequent item sets
g gens generators
r rules association rules
supp minimum support of an item set (default: 10)
(positive: percentage, negative: absolute number)
conf minimum confidence of an assoc. rule (default: 80%)
zmin minimum number of items per item set (default: 1)
zmax maximum number of items per item set (default: no limit)
report values to report with an item set (default: a)
a absolute item set support (number of transactions)
s relative item set support as a fraction
S relative item set support as a percentage
e value of item set evaluation measure
E value of item set evaluation measure as a percentage
( combine values in a tuple (must be first character)
[ combine values in a list (must be first character)
# pattern spectrum as a dictionary (no patterns)
= pattern spectrum as a list (no patterns)
| pattern spectrum as three columns (no patterns)
for target 'r' (association rules) also available:
b absolute body set support (number of transactions)
x relative body set support as a fraction
X relative body set support as a percentage
h absolute head item support (number of transactions)
y relative head item support as a fraction
Y relative head item support as a percentage
c rule confidence as a fraction
C rule confidence as a percentage
l lift value of a rule (confidence/prior)
L lift value of a rule as a percentage
Q support of the empty set (total number of transactions)
eval measure for item set evaluation (default: x)
x none no measure / zero (default)
b ldratio binary logarithm of support quotient (+)
c conf rule confidence (+)
d confdiff absolute confidence difference to prior (+)
l lift lift value (confidence divided by prior) (+)
a liftdiff absolute difference of lift value to 1 (+)
q liftquot difference of lift quotient to 1 (+)
v cvct conviction (inverse lift for negated head) (+)
e cvctdiff absolute difference of conviction to 1 (+)
r cvctquot difference of conviction quotient to 1 (+)
k cprob conditional probability ratio (+)
j import importance (binary log. of prob. ratio) (+)
z cert certainty factor (relative conf. change) (+)
n chi2 normalized chi^2 measure (+)
p chi2pval p-value from (unnormalized) chi^2 measure (-)
y yates normalized chi^2 with Yates' correction (+)
t yatespval p-value from Yates-corrected chi^2 measure (-)
i info information difference to prior (+)
g infopval p-value from G statistic/info. difference (-)
f fetprob Fisher's exact test (table probability) (-)
h fetchi2 Fisher's exact test (chi^2 measure) (-)
m fetinfo Fisher's exact test (mutual information) (-)
s fetsupp Fisher's exact test (support) (-)
Measures marked with (+) must meet or exceed the threshold,
measures marked with (-) must not exceed the threshold
in order for the item set to be reported.
agg evaluation measure aggregation mode (default: x)
x none no aggregation (use first value)
m min minimum of individual measure values
n max maximum of individual measure values
a avg average of individual measure values
thresh threshold for evaluation measure (default: 10%)
prune min. size for evaluation filtering (default: no pruning)
= 0 backward filtering (no subset check)
< 0 weak forward filtering (one subset must qualify)
> 0 strong forward filtering (all subsets must qualify)
algo algorithm variant to use (default: a)
b basic standard algorithm (only choice)
mode operation mode indicators/flags (default: None)
x do not use perfect extension pruning
t/T do not organize transactions as a prefix tree
y a-posteriori pruning of infrequent item sets
z invalidate evaluation below expected support
o use original rule support definition (body & head)
border support border for filtering item sets (default: None)
Must be a list or tuple of (absolute) minimum support values
per item set size (by which the list/tuple is indexed).
appear dictionary mapping items to item appearance indicators,
with the key None referring to the default item appearance.
(If None does not occur as a key or no dictionary is given,
the default item appearance indicator is 'both'.)
This parameter is only used if the target type is rules.
* item may not appear anywhere in a rule:
'-', 'n', 'none', 'neither', 'ignore'
* item may appear only in rule body/antecedent:
'i', 'in', 'inp', 'input', 'b', 'body',
'a', 'ante', 'antecedent'
* item may appear only in rule head/consequent:
'o', 'out', 'output', 'h', 'head',
'c', 'cons', 'consequent'
* item may appear anywhere in a rule:
'io', 'i&o', 'inout', 'in&out', 'bh', 'b&h', 'both'
returns if report is not in ['#','=','|']:
if the target is association rules:
a list of rules (i.e. tuples with two or more elements),
each consisting of a head/consequent item, a tuple with
a body/antecedent item set, and the values selected by
the parameter 'report', which may be combined into a
tuple or a list if report[0] is '(' or '[', respectively.
if the target is a type of item sets:
a list of patterns (i.e. tuples with one or more elements),
each consisting of a tuple with a found frequent item set
and the values selected by the parameter 'report', which
may be combined into a tuple or list if report[0] is '('
or '[', respectively
if report in ['#','=','|']:
a pattern spectrum as a dictionary mapping pattern sizes
to the corresponding occurrence support ranges, as a list
of triplets (size, min. support, max. support) or as three
columns for sizes and minimum and maximum support values
# All ('s') frequent itemsets of size >= 2 (zmin=2) with min support 2%;
# report='S' attaches the support as a percentage to each itemset.
itemsets = apriori(X, target='s', supp=2, zmin=2, report='S')
itemsets[:5]
[(('8.0_Valence', 'Not Polysemy'), 2.1785561725758225),
(('6.0_SemSize', 'Not Polysemy'), 2.2212729602733874),
(('11_Lenght', 'Not Polysemy'), 2.3067065356685177),
(('1.0_Masculinity', 'Not Polysemy'), 2.285348141819735),
(('2.0_Dominance', 'Not Polysemy'), 2.5630072618539086)]
# frequent itemset: each result is a (itemset, support) pair,
# index 0 is the tuple of items
itemsets[0][0]
('8.0_Valence', 'Not Polysemy')
# support of the first itemset (a percentage, per report='S')
itemsets[0][1]
2.1785561725758225
# Maximal ('m') frequent itemsets: frequent itemsets none of whose
# supersets is frequent; same support/size thresholds as above.
itemsets = apriori(X, target='m', supp=2, zmin=2, report='S')
itemsets[:5]
[(('8.0_Valence', 'Not Polysemy'), 2.1785561725758225),
(('6.0_SemSize', 'Not Polysemy'), 2.2212729602733874),
(('11_Lenght', 'Not Polysemy'), 2.3067065356685177),
(('1.0_Masculinity', 'Not Polysemy'), 2.285348141819735),
(('2.0_Dominance', 'Not Polysemy'), 2.5630072618539086)]
itemsets[0][0]
('8.0_Valence', 'Not Polysemy')
df_pm.describe()
| length | arousal | valence | dominance | familiarity | aoa | semsize | masculinity | polysemy | web_corpus_freq | perceivability | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 4682 | 4682 | 4682 | 4682 | 4682 | 4682 | 4682 | 4682 | 4682 | 4682 | 4682 |
| unique | 4 | 4 | 4 | 4 | 4 | 4 | 4 | 4 | 2 | 7 | 4 |
| top | 1.0_Lenght | 1.0_Arousal | 2.0_Valence | 1.0_Dominance | 2.0_Familiarity | 2.0_Age_of_Aquisition | 2.0_SemSize | 2.0_Masculinity | Not Polysemy | 6.0_Web_Corpus_Freq | 1.0_Perceivability |
| freq | 3407 | 2905 | 2684 | 2300 | 2964 | 2113 | 2299 | 2625 | 4303 | 2122 | 1940 |
# Sensitivity of the number of patterns to the minimum support threshold:
# for each support % in 1..8, count maximal ('m'), closed ('c') and
# all ('s') frequent itemsets of size >= 3.
supports = list(range(1, 8 + 1))
len_max_it = []
len_cl_it = []
len_all_it = []
for s in supports:
    len_max_it.append(len(apriori(X, target='m', supp=s, zmin=3)))
    len_cl_it.append(len(apriori(X, target='c', supp=s, zmin=3)))
    len_all_it.append(len(apriori(X, target='s', supp=s, zmin=3)))
# Fix: plot against the actual support values; the original plotted against
# the list index (0..7), off by one w.r.t. the tested supports (1..8),
# so the '%support' axis was misleading.
plt.plot(supports, len_max_it, label='maximal')
plt.plot(supports, len_all_it, label='all')
plt.plot(supports, len_cl_it, label='closed')
plt.legend(fontsize=15)
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)
plt.xlabel('%support', fontsize=15)
plt.show()
# Sensitivity of the number of patterns to the minimum itemset size:
# for each zmin in 1..6 (at fixed support 2%), count maximal, closed
# and all frequent itemsets.
zmins = list(range(1, 6 + 1))
len_max_it = []
len_cl_it = []
len_all_it = []
for z in zmins:
    len_max_it.append(len(apriori(X, target='m', supp=2, zmin=z)))
    len_cl_it.append(len(apriori(X, target='c', supp=2, zmin=z)))
    len_all_it.append(len(apriori(X, target='s', supp=2, zmin=z)))
# Fix: plot against the actual zmin values; the original plotted against
# the list index (0..5), off by one w.r.t. the tested sizes (1..6).
plt.plot(zmins, len_max_it, label='maximal')
plt.plot(zmins, len_cl_it, label='closed')
plt.plot(zmins, len_all_it, label='all')
plt.legend(fontsize=15)
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)
plt.xlabel('#zmin', fontsize=15)
plt.show()
# Drop the polysemy column (highly imbalanced: 4303/4682 are 'Not Polysemy',
# see describe() above) before re-mining patterns without it.
df_no_pol=df_pm.drop('polysemy',axis=1)
df_pm.head()
| length | arousal | valence | dominance | familiarity | aoa | semsize | masculinity | web_corpus_freq | perceivability | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 8_Lenght | 4.0_Arousal | 2.0_Valence | 4.0_Dominance | 1.0_Familiarity | 10.0_Age_of_Aquisition | 6.0_SemSize | 7.0_Masculinity | 5.0_Web_Corpus_Freq | 6.0_Perceivability |
| 1 | 5_Lenght | 2.0_Arousal | 6.0_Valence | 4.0_Dominance | 3.0_Familiarity | 7.0_Age_of_Aquisition | 7.0_SemSize | 4.0_Masculinity | 6.0_Web_Corpus_Freq | 7.0_Perceivability |
| 2 | 10_Lenght | 2.0_Arousal | 6.0_Valence | 5.0_Dominance | 7.0_Familiarity | 8.0_Age_of_Aquisition | 2.0_SemSize | 5.0_Masculinity | 5.0_Web_Corpus_Freq | 3.0_Perceivability |
| 3 | 8_Lenght | 3.0_Arousal | 4.0_Valence | 4.0_Dominance | 4.0_Familiarity | 9.0_Age_of_Aquisition | 6.0_SemSize | 7.0_Masculinity | 5.0_Web_Corpus_Freq | 2.0_Perceivability |
| 4 | 10_Lenght | 3.0_Arousal | 4.0_Valence | 4.0_Dominance | 3.0_Familiarity | 9.0_Age_of_Aquisition | 7.0_SemSize | 6.0_Masculinity | 5.0_Web_Corpus_Freq | 2.0_Perceivability |
# transactions without the polysemy column
X_no_pol = df_no_pol.values.tolist()
# Support sweep on the polysemy-free transactions.
# Fix: the original mined X (which still contains polysemy) even though
# X_no_pol was built on the line above precisely for this analysis
# (the rules section below does use X_no_pol).
supports = list(range(1, 5 + 1))
len_max_it = []
len_cl_it = []
len_all_it = []
for s in supports:
    len_max_it.append(len(apriori(X_no_pol, target='m', supp=s, zmin=3)))
    len_cl_it.append(len(apriori(X_no_pol, target='c', supp=s, zmin=3)))
    len_all_it.append(len(apriori(X_no_pol, target='s', supp=s, zmin=3)))
# plot against the actual support values (the original used the list index)
plt.plot(supports, len_all_it, label='all')
plt.plot(supports, len_max_it, label='maximal')
plt.plot(supports, len_cl_it, label='closed')
plt.legend(fontsize=15)
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)
plt.xlabel('%support', fontsize=15)
plt.show()
# zmin sweep on the polysemy-free transactions (fixed support 2%).
# Fixes: (1) mine X_no_pol instead of X — this section analyses the
# dataset without polysemy; (2) the x-axis label said '%support' although
# the loop varies zmin; (3) plot against the actual zmin values.
zmins = list(range(1, 6 + 1))
len_max_it = []
len_cl_it = []
len_all_it = []
for z in zmins:
    len_max_it.append(len(apriori(X_no_pol, target='m', supp=2, zmin=z)))
    len_cl_it.append(len(apriori(X_no_pol, target='c', supp=2, zmin=z)))
    len_all_it.append(len(apriori(X_no_pol, target='s', supp=2, zmin=z)))
plt.plot(zmins, len_all_it, label='all')
plt.plot(zmins, len_max_it, label='maximal')
plt.plot(zmins, len_cl_it, label='closed')
plt.legend(fontsize=15)
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)
plt.xlabel('#zmin', fontsize=15)
plt.show()
# Count frequent itemsets (zmin>=3) containing an early (bins 0-5) vs. a
# late (bins 6-10) age-of-acquisition item, across support thresholds 2..8.
# The 11 copy-pasted filter lines are collapsed into two bin lists; the
# per-bin counts are summed exactly as before. ('Aquisition' spelling
# matches the labels in the data — do not correct it here.)
aoa_young = ['%d.0_Age_of_Aquisition' % k for k in range(0, 5 + 1)]
aoa_old = ['%d.0_Age_of_Aquisition' % k for k in range(6, 10 + 1)]
supports = list(range(2, 8 + 1))
filter_0 = []
filter_1 = []
for s in supports:
    itemsets = apriori(X, target='a', supp=s, zmin=3)
    filter_0.append(sum(len([it for it in itemsets if b in it[0]]) for b in aoa_young))
    filter_1.append(sum(len([it for it in itemsets if b in it[0]]) for b in aoa_old))
# plot against the actual support values (the original used the list
# index 0..6 for supports 2..8)
plt.plot(supports, filter_0, label='younger')
plt.plot(supports, filter_1, label='older')
plt.legend(fontsize=15)
plt.xticks(fontsize=20)
plt.xlabel('%support', fontsize=15)
plt.show()
# Count frequent itemsets (zmin>=3) containing a low (bins 0-5) vs. a
# high (bins 6-10) valence item, across support thresholds 2..8.
# Same collapse of the copy-pasted filter lines as for age of acquisition.
val_low = ['%d.0_Valence' % k for k in range(0, 5 + 1)]
val_high = ['%d.0_Valence' % k for k in range(6, 10 + 1)]
supports = list(range(2, 8 + 1))
filter_0 = []
filter_1 = []
for s in supports:
    itemsets = apriori(X, target='a', supp=s, zmin=3)
    filter_0.append(sum(len([it for it in itemsets if b in it[0]]) for b in val_low))
    filter_1.append(sum(len([it for it in itemsets if b in it[0]]) for b in val_high))
# plot against the actual support values (the original used the list index)
plt.plot(supports, filter_0, label='no val')
plt.plot(supports, filter_1, label='val')
plt.legend(fontsize=15)
plt.xticks(fontsize=20)
plt.xlabel('%support', fontsize=15)
plt.show()
# Association rules ('r') on the polysemy-free transactions:
# min support 7%, min confidence 75%; report='aScl' attaches, per rule,
# absolute support, support %, confidence and lift (so each rule is a
# tuple: head, body, abs_supp, supp%, conf, lift).
rules = apriori(X_no_pol, target='r', supp=7, zmin=1, conf=75, report='aScl')
len(rules)
51
# One-at-a-time sensitivity of the rule count to each mining parameter:
# iteration i varies supp (at conf=60), zmin (at supp=8, conf=60) and
# conf (at supp=8) in lockstep over the three value lists.
rules_supp, rules_zmin, rules_conf = [], [], []
supp_range = [6, 8, 10, 12, 14]
zmin_range = [1, 2, 3, 4, 5]
conf_range = [50, 60, 70, 80, 90]
for s, z, c in zip(supp_range, zmin_range, conf_range):
    rules_supp.append(len(apriori(X, target='r', supp=s, conf=60)))
    rules_zmin.append(len(apriori(X, target='r', zmin=z, conf=60, supp=8)))
    rules_conf.append(len(apriori(X, target='r', conf=c, supp=8)))
for series, lab in ((rules_supp, 'supp'), (rules_zmin, 'zmin'), (rules_conf, 'conf')):
    plt.plot(series, label=lab)
plt.legend(fontsize=15)
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)
plt.xlabel('iteration', fontsize=15)
plt.show()
# List the rules whose lift (index 5 in the 'aScl' report) exceeds 1.6,
# i.e. clearly positive associations, printed as (body, head, lift),
# then report how many there are.
strong_rules = [r for r in rules if r[5] > 1.6]
for rule in strong_rules:
    print(rule[1], rule[0], rule[5])
print("\n")
print(len(strong_rules))
('0.0_Age_of_Aquisition', '3.0_Perceivability') 3.0_Familiarity 2.0291812958222906
('0.0_Age_of_Aquisition', '0.0_Lenght') 3.0_Familiarity 2.092629715676513
('0.0_Age_of_Aquisition', '2.0_Dominance') 3.0_Familiarity 2.15094549757535
('0.0_Age_of_Aquisition', '2.0_Valence') 3.0_Familiarity 2.0607110927721615
('0.0_Age_of_Aquisition',) 3.0_Familiarity 2.0683135425501837
('3.0_Valence', '2.0_Arousal') 2.0_Dominance 1.6851493862268596
('3.0_Valence', '2.0_SemSize') 2.0_Dominance 1.6761757612300272
('3.0_Perceivability', '7.0_Web_Corpus_Freq', '2.0_Valence') 3.0_Familiarity 1.7993229600606484
('3.0_Perceivability', '1.0_Age_of_Aquisition', '2.0_Dominance') 2.0_Valence 1.6206524147097918
('3.0_Perceivability', '0.0_Lenght', '2.0_Dominance') 2.0_Valence 1.6045936704181385
('3.0_Perceivability', '1.0_Masculinity') 2.0_Valence 1.6422164675005584
('3.0_Perceivability', '1.0_SemSize', '2.0_Dominance') 2.0_Valence 1.6413434838948537
('3.0_Perceivability', '1.0_SemSize', '1.0_Arousal') 2.0_Valence 1.6120887400184276
('3.0_Perceivability', '3.0_Familiarity', '2.0_Dominance') 2.0_Valence 1.621554705133399
('3.0_Perceivability', '3.0_Familiarity', '1.0_Arousal') 2.0_Valence 1.6759611065358988
('3.0_Perceivability', '2.0_Dominance', '1.0_Arousal') 2.0_Valence 1.774768741956242
('3.0_Perceivability', '2.0_Dominance', '2.0_Masculinity') 2.0_Valence 1.6559708342796577
('7.0_Web_Corpus_Freq', '2.0_Dominance', '1.0_Arousal') 2.0_Valence 1.739711516684282
('1.0_Age_of_Aquisition', '2.0_Dominance', '1.0_Arousal') 2.0_Valence 1.6366788663663663
('1.0_Masculinity', '2.0_Dominance', '1.0_Arousal') 2.0_Valence 1.6379451670168828
('1.0_SemSize', '3.0_Familiarity', '2.0_Dominance') 2.0_Valence 1.6112367991384702
('1.0_SemSize', '2.0_Dominance', '1.0_Arousal') 2.0_Valence 1.6667974426038943
('1.0_SemSize', '2.0_Dominance', '1.0_Lenght') 2.0_Valence 1.6243252813720235
('3.0_Familiarity', '2.0_Dominance', '1.0_Arousal') 2.0_Valence 1.6774946657183498
('2.0_Dominance', '1.0_Arousal', '2.0_Masculinity') 2.0_Valence 1.6309943211694184
('2.0_Dominance', '1.0_Arousal', '1.0_Lenght') 2.0_Valence 1.615561697383268
('2.0_Dominance', '1.0_Arousal') 2.0_Valence 1.600871642034784
27
# Rules grouped by consequent (a rule's head is at index 0):
# dom1 = rules predicting low dominance, val2 = rules predicting low valence.
dom1=[r for r in rules if r[0] == '1.0_Dominance']
val2=[r for r in rules if r[0] == '2.0_Valence']
print(dom1)
[('1.0_Dominance', ('1.0_Valence', '1.0_Perceivability', '1.0_Lenght'), 401, 8.56471593336181, 0.8285123966942148, 1.6865630614444844), ('1.0_Dominance', ('1.0_Valence', '1.0_Perceivability'), 528, 11.277231952157198, 0.822429906542056, 1.6741812271434375), ('1.0_Dominance', ('1.0_Valence', '1.0_SemSize', '1.0_Arousal'), 435, 9.29090132422042, 0.8285714285714286, 1.6866832298136647), ('1.0_Dominance', ('1.0_Valence', '1.0_SemSize', '1.0_Lenght'), 411, 8.778299871849637, 0.8187250996015937, 1.666639528841157), ('1.0_Dominance', ('1.0_Valence', '1.0_SemSize'), 555, 11.853908586074327, 0.8149779735682819, 1.65901168358552), ('1.0_Dominance', ('1.0_Valence', '2.0_Masculinity', '1.0_Arousal', '1.0_Lenght'), 405, 8.650149508756941, 0.805168986083499, 1.6390439968882358), ('1.0_Dominance', ('1.0_Valence', '2.0_Masculinity', '1.0_Arousal'), 552, 11.78983340452798, 0.8165680473372781, 1.6622485207100592), ('1.0_Dominance', ('1.0_Valence', '1.0_Arousal', '2.0_Familiarity'), 459, 9.8035027765912, 0.8038528896672504, 1.6363648823574202), ('1.0_Dominance', ('1.0_Valence', '1.0_Arousal', '1.0_Lenght'), 549, 11.725758222981632, 0.8133333333333334, 1.655663768115942), ('1.0_Dominance', ('1.0_Valence', '1.0_Arousal'), 751, 16.040153780435713, 0.8189749182115594, 1.667148072637618)]
# For each rule predicting '1.0_Dominance', show the consequent and the
# antecedent itemset.
# Fix: iterate the precomputed dom1 directly — the original re-filtered
# the whole `rules` list twice per iteration (accidental O(n^2)) and
# ignored the dom1 it had just built; the printed output is identical.
for rule in dom1:
    print('to_predict:', rule[0])
    print('how?', rule[1])
to_predict: 1.0_Dominance
how? ('1.0_Valence', '1.0_Perceivability', '1.0_Lenght')
to_predict: 1.0_Dominance
how? ('1.0_Valence', '1.0_Perceivability')
to_predict: 1.0_Dominance
how? ('1.0_Valence', '1.0_SemSize', '1.0_Arousal')
to_predict: 1.0_Dominance
how? ('1.0_Valence', '1.0_SemSize', '1.0_Lenght')
to_predict: 1.0_Dominance
how? ('1.0_Valence', '1.0_SemSize')
to_predict: 1.0_Dominance
how? ('1.0_Valence', '2.0_Masculinity', '1.0_Arousal', '1.0_Lenght')
to_predict: 1.0_Dominance
how? ('1.0_Valence', '2.0_Masculinity', '1.0_Arousal')
to_predict: 1.0_Dominance
how? ('1.0_Valence', '1.0_Arousal', '2.0_Familiarity')
to_predict: 1.0_Dominance
how? ('1.0_Valence', '1.0_Arousal', '1.0_Lenght')
to_predict: 1.0_Dominance
how? ('1.0_Valence', '1.0_Arousal')
# For each rule predicting '2.0_Valence', show the consequent and the
# antecedent itemset.
# Fix: iterate the precomputed val2 directly — the original re-filtered
# the whole `rules` list twice per iteration (accidental O(n^2));
# the printed output is identical.
for rule in val2:
    print('to_predict:', rule[0])
    print('how?', rule[1])
to_predict: 2.0_Valence
how? ('3.0_Perceivability', '1.0_Age_of_Aquisition', '1.0_Arousal')
to_predict: 2.0_Valence
how? ('3.0_Perceivability', '1.0_Age_of_Aquisition', '1.0_Lenght')
to_predict: 2.0_Valence
how? ('3.0_Perceivability', '1.0_Age_of_Aquisition')
to_predict: 2.0_Valence
how? ('3.0_Perceivability', '1.0_SemSize')
to_predict: 2.0_Valence
how? ('3.0_Perceivability', '2.0_Dominance')
to_predict: 2.0_Valence
how? ('3.0_Perceivability', '1.0_Arousal', '1.0_Lenght')
to_predict: 2.0_Valence
how? ('3.0_Perceivability', '1.0_Arousal')
to_predict: 2.0_Valence
how? ('3.0_Perceivability', '1.0_Lenght')
to_predict: 2.0_Valence
how? ('3.0_Perceivability',)
to_predict: 2.0_Valence
how? ('1.0_Age_of_Aquisition', '1.0_SemSize', '2.0_Dominance')
to_predict: 2.0_Valence
how? ('1.0_Age_of_Aquisition', '2.0_Dominance', '1.0_Arousal')
to_predict: 2.0_Valence
how? ('1.0_SemSize', '2.0_Dominance', '1.0_Arousal')
to_predict: 2.0_Valence
how? ('1.0_SemSize', '2.0_Dominance', '2.0_Familiarity')
to_predict: 2.0_Valence
how? ('1.0_SemSize', '2.0_Dominance', '1.0_Lenght')
to_predict: 2.0_Valence
how? ('1.0_SemSize', '2.0_Dominance')
to_predict: 2.0_Valence
how? ('6.0_Web_Corpus_Freq', '2.0_Dominance', '1.0_Arousal')
to_predict: 2.0_Valence
how? ('2.0_Dominance', '2.0_Masculinity', '1.0_Arousal', '1.0_Lenght')
to_predict: 2.0_Valence
how? ('2.0_Dominance', '2.0_Masculinity', '1.0_Arousal')
to_predict: 2.0_Valence
how? ('2.0_Dominance', '1.0_Arousal', '2.0_Familiarity', '1.0_Lenght')
to_predict: 2.0_Valence
how? ('2.0_Dominance', '1.0_Arousal', '2.0_Familiarity')
to_predict: 2.0_Valence
how? ('2.0_Dominance', '1.0_Arousal', '1.0_Lenght')
to_predict: 2.0_Valence
how? ('2.0_Dominance', '1.0_Arousal')
# <-- anatomy of one rule: (head/consequent, body/antecedent tuple)
rules[0][0], rules[0][1]
('Not Polysemy', ('8_Lenght',))
# absolute support (number of supporting transactions)
rules[0][2]
484
# support as a percentage of all transactions
rules[0][3]
10.337462622810763
# confidence: support(body & head) / support(body)
rules[0][4]
0.983739837398374
# lift: confidence / support(head); > 1 indicates positive association
rules[0][5]
1.0703857584706453